From d849e38200d445c692015dba844ede63fddb91bc Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Wed, 18 Feb 2026 20:44:32 -0500 Subject: [PATCH 1/5] set sequence-length-is-longer-than-the-specified-maximum to false --- transformer_lens/utils.py | 121 +++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index d13371cc9..f2a38dd8b 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -351,62 +351,77 @@ def tokenize_and_concatenate( if tokenizer.pad_token is None: # We add a padding token, purely to implement the tokenizer. This will be removed before inputting tokens to the model, so we do not need to increment d_vocab in the model. tokenizer.add_special_tokens({"pad_token": ""}) - # Define the length to chop things up into - leaving space for a bos_token if required - if add_bos_token: - seq_len = max_length - 1 - else: - seq_len = max_length - - def tokenize_function(examples: dict[str, list[str]]) -> dict[str, np.ndarray]: - text = examples[column_name] - # Concatenate it all into an enormous string, separated by eos_tokens - assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." - full_text = tokenizer.eos_token.join(text) - - # Handle the case when full_text is empty - if not full_text.strip(): - return {"tokens": np.array([], dtype=np.int64)} - - # Divide into 20 chunks of ~ equal length - num_chunks = 20 - chunk_length = (len(full_text) - 1) // num_chunks + 1 - chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] - # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned - tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() - # Drop padding tokens - tokens = tokens[tokens != tokenizer.pad_token_id] - num_tokens = len(tokens) - - # Handle cases where num_tokens is less than seq_len - if num_tokens < seq_len: - num_batches = 1 - # Pad tokens if necessary - tokens = tokens[:seq_len] - if len(tokens) < seq_len: - padding_length = seq_len - len(tokens) - padding = np.full(padding_length, tokenizer.pad_token_id) - tokens = np.concatenate([tokens, padding], axis=0) - else: - num_batches = num_tokens // seq_len - # Drop the final tokens if not enough to make a full sequence - tokens = tokens[: seq_len * num_batches] - tokens = einops.rearrange( - tokens, "(batch seq) -> batch seq", batch=num_batches, seq=seq_len + # Suppress the "sequence length longer than maximum" warning during chunked tokenization. 
+ _deprecation_warnings_saved = None + if hasattr(tokenizer, "deprecation_warnings") and isinstance( + tokenizer.deprecation_warnings, dict + ): + _deprecation_warnings_saved = tokenizer.deprecation_warnings.copy() + tokenizer.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = ( + False ) + try: + # Define the length to chop things up into - leaving space for a bos_token if required if add_bos_token: - prefix = np.full((num_batches, 1), tokenizer.bos_token_id) - tokens = np.concatenate([prefix, tokens], axis=1) - return {"tokens": tokens} - - tokenized_dataset = dataset.map( - tokenize_function, - batched=True, - num_proc=(num_proc if not streaming else None), - remove_columns=[column_name], - ) - tokenized_dataset.set_format(type="torch", columns=["tokens"]) - return tokenized_dataset + seq_len = max_length - 1 + else: + seq_len = max_length + + def tokenize_function(examples: dict[str, list[str]]) -> dict[str, np.ndarray]: + text = examples[column_name] + # Concatenate it all into an enormous string, separated by eos_tokens + assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." + full_text = tokenizer.eos_token.join(text) + + # Handle the case when full_text is empty + if not full_text.strip(): + return {"tokens": np.array([], dtype=np.int64)} + + # Divide into 20 chunks of ~ equal length + num_chunks = 20 + chunk_length = (len(full_text) - 1) // num_chunks + 1 + chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] + # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned + tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() + # Drop padding tokens + tokens = tokens[tokens != tokenizer.pad_token_id] + num_tokens = len(tokens) + + # Handle cases where num_tokens is less than seq_len + if num_tokens < seq_len: + num_batches = 1 + # Pad tokens if necessary + tokens = tokens[:seq_len] + if len(tokens) < seq_len: + padding_length = seq_len - len(tokens) + padding = np.full(padding_length, tokenizer.pad_token_id) + tokens = np.concatenate([tokens, padding], axis=0) + else: + num_batches = num_tokens // seq_len + # Drop the final tokens if not enough to make a full sequence + tokens = tokens[: seq_len * num_batches] + + tokens = einops.rearrange( + tokens, "(batch seq) -> batch seq", batch=num_batches, seq=seq_len + ) + if add_bos_token: + prefix = np.full((num_batches, 1), tokenizer.bos_token_id) + tokens = np.concatenate([prefix, tokens], axis=1) + return {"tokens": tokens} + + tokenized_dataset = dataset.map( + tokenize_function, + batched=True, + num_proc=(num_proc if not streaming else None), + remove_columns=[column_name], + ) + tokenized_dataset.set_format(type="torch", columns=["tokens"]) + return tokenized_dataset + finally: + if _deprecation_warnings_saved is not None: + tokenizer.deprecation_warnings.clear() + tokenizer.deprecation_warnings.update(_deprecation_warnings_saved) def sample_logits( From 61fccf508c142ee81717918894b6d9e3aa35f024 Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Wed, 18 Feb 2026 20:53:28 -0500 Subject: [PATCH 2/5] add test --- tests/unit/test_utils.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 242df3987..4fe632802 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pytest import torch @@ -433,3 +435,40 @@ def 
test_init_xavier_normal(self, d_model, d_mlp): x_new = nn.Parameter(torch.empty(2, d_model, 137)) utils.init_xavier_normal_(x_new) assert torch.allclose(x_new, x, rtol=1e-2) + + +def test_tokenize_and_concatenate_no_spurious_sequence_length_warning(): + """Test that tokenize_and_concatenate does not emit the HF 'sequence length longer than maximum' warning.""" + from datasets import Dataset + from transformers import AutoTokenizer + + # Use a tokenizer with model_max_length and EOS + tokenizer = AutoTokenizer.from_pretrained("t5-small") + assert tokenizer.model_max_length == 512 + assert tokenizer.eos_token is not None + + # Long text so that when split into 20 chunks, at least one chunk tokenizes to > 512 tokens + long_text = "word " * 20000 + dataset = Dataset.from_dict({"text": [long_text]}) + + with warnings.catch_warnings(record=True) as recorded: + warnings.simplefilter("always") + result = utils.tokenize_and_concatenate( + dataset, + tokenizer, + max_length=tokenizer.model_max_length, + add_bos_token=False, + ) + + # No warning about sequence length exceeding model maximum + for w in recorded: + msg = str(w.message) + assert "longer than the specified maximum" not in msg, ( + f"tokenize_and_concatenate should not emit sequence-length warning; got: {msg}" + ) + + # Sanity: output has expected shape (batch, max_length); result is a Dataset + assert len(result) >= 1 + first_row = result[0]["tokens"] + assert first_row.shape[0] == tokenizer.model_max_length + assert first_row.dim() == 1 From 46103587b907c65f83a32ab094de897ed212d5bb Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Wed, 18 Feb 2026 21:34:00 -0500 Subject: [PATCH 3/5] fix test --- transformer_lens/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index f2a38dd8b..133bf4de1 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -368,7 +368,8 @@ def tokenize_and_concatenate( else: seq_len = max_length - def tokenize_function(examples: dict[str, list[str]]) -> dict[str, np.ndarray]: + def tokenize_function(examples: Any) -> dict[str, np.ndarray]: + # datasets.map() may pass a LazyBatch, not a plain dict; accept dict-like batches text = examples[column_name] # Concatenate it all into an enormous string, separated by eos_tokens assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." 
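
The looser annotation in PATCH 3/5 reflects that datasets.map() with batched=True may hand the mapping function a LazyBatch rather than a plain dict, so only mapping-style access on the batch is guaranteed. A minimal illustration of that behaviour, separate from the patch itself (hypothetical function name, assuming a recent datasets release):

    from datasets import Dataset

    def inspect_batch(batch):
        # On recent datasets versions this prints "LazyBatch", not "dict";
        # the function should only rely on mapping-style access like batch["text"].
        print(type(batch).__name__)
        return {"n_chars": [len(t) for t in batch["text"]]}

    ds = Dataset.from_dict({"text": ["hello", "world"]})
    ds = ds.map(inspect_batch, batched=True)
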
From 34d1d33e27a5072c15003a600f277fe136d3b7bc Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Thu, 19 Feb 2026 17:16:02 -0500 Subject: [PATCH 4/5] fix tests --- tests/unit/test_utils.py | 7 +-- transformer_lens/HookedTransformer.py | 7 +-- transformer_lens/utils.py | 65 ++++++--------------------- 3 files changed, 19 insertions(+), 60 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index a4d349ca0..02ee73807 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -458,14 +458,15 @@ def test_tokenize_and_concatenate_no_spurious_sequence_length_warning(): tokenizer, max_length=tokenizer.model_max_length, add_bos_token=False, + streaming=True, ) # No warning about sequence length exceeding model maximum for w in recorded: msg = str(w.message) - assert "longer than the specified maximum" not in msg, ( - f"tokenize_and_concatenate should not emit sequence-length warning; got: {msg}" - ) + assert ( + "longer than the specified maximum" not in msg + ), f"tokenize_and_concatenate should not emit sequence-length warning; got: {msg}" # Sanity: output has expected shape (batch, max_length); result is a Dataset assert len(result) >= 1 diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py index e39a02eea..f69f96e4f 100644 --- a/transformer_lens/HookedTransformer.py +++ b/transformer_lens/HookedTransformer.py @@ -583,12 +583,7 @@ def forward( self, prepend_bos=prepend_bos, padding_side=padding_side ): if start_at_layer is None: - ( - residual, - tokens, - shortformer_pos_embed, - attention_mask, - ) = self.input_to_embed( + (residual, tokens, shortformer_pos_embed, attention_mask,) = self.input_to_embed( input, prepend_bos=prepend_bos, padding_side=padding_side, diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index b694b35a5..7d5712ea0 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -353,57 +353,13 @@ def tokenize_and_concatenate( # We add a padding token, purely to implement the tokenizer. This will be removed before inputting tokens to the model, so we do not need to increment d_vocab in the model. tokenizer.add_special_tokens({"pad_token": ""}) - # Define the length to chop things up into - leaving space for a bos_token if required - if add_bos_token: - seq_len = max_length - 1 - else: - seq_len = max_length - - def tokenize_function(examples: Any) -> dict[str, np.ndarray]: - # datasets.map() may pass a LazyBatch, not a plain dict; accept dict-like batches - text = examples[column_name] - # Concatenate it all into an enormous string, separated by eos_tokens - assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." - full_text = tokenizer.eos_token.join(text) - - # Handle the case when full_text is empty - if not full_text.strip(): - return {"tokens": np.array([], dtype=np.int64)} - - # Divide into 20 chunks of ~ equal length - num_chunks = 20 - chunk_length = (len(full_text) - 1) // num_chunks + 1 - chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] - # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned - tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() - # Drop padding tokens - tokens = tokens[tokens != tokenizer.pad_token_id] - num_tokens = len(tokens) - - # Handle cases where num_tokens is less than seq_len - if num_tokens < seq_len: - num_batches = 1 - # Pad tokens if necessary. Use eos_token_id if the model has no pad token. 
- tokens = tokens[:seq_len] - if len(tokens) < seq_len: - padding_length = seq_len - len(tokens) - padding_id = tokenizer.eos_token_id if not has_pad_token else tokenizer.pad_token_id - padding = np.full(padding_length, padding_id) - tokens = np.concatenate([tokens, padding], axis=0) - else: - num_batches = num_tokens // seq_len - # Drop the final tokens if not enough to make a full sequence - tokens = tokens[: seq_len * num_batches] - # Suppress the "sequence length longer than maximum" warning during chunked tokenization. _deprecation_warnings_saved = None - if hasattr(tokenizer, "deprecation_warnings") and isinstance( - tokenizer.deprecation_warnings, dict - ): + if hasattr(tokenizer, "deprecation_warnings"): _deprecation_warnings_saved = tokenizer.deprecation_warnings.copy() - tokenizer.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = ( - False - ) + tokenizer.deprecation_warnings[ + "sequence-length-is-longer-than-the-specified-maximum" + ] = False try: # Define the length to chop things up into - leaving space for a bos_token if required if add_bos_token: @@ -425,7 +381,9 @@ def tokenize_function(examples: Any) -> dict[str, np.ndarray]: # Divide into 20 chunks of ~ equal length num_chunks = 20 chunk_length = (len(full_text) - 1) // num_chunks + 1 - chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] + chunks = [ + full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks) + ] # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() # Drop padding tokens @@ -439,7 +397,10 @@ def tokenize_function(examples: Any) -> dict[str, np.ndarray]: tokens = tokens[:seq_len] if len(tokens) < seq_len: padding_length = seq_len - len(tokens) - padding = np.full(padding_length, tokenizer.pad_token_id) + padding_id = ( + tokenizer.eos_token_id if not has_pad_token else tokenizer.pad_token_id + ) + padding = np.full(padding_length, padding_id) tokens = np.concatenate([tokens, padding], axis=0) else: num_batches = num_tokens // seq_len @@ -535,7 +496,9 @@ def sample_logits( SliceInput = Optional[ Union[ int, - Tuple[int,], + Tuple[ + int, + ], Tuple[int, int], Tuple[int, int, int], List[int], From 86a94679c6eb79663ca3c1488005d2dee1abffdd Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Thu, 19 Feb 2026 17:25:27 -0500 Subject: [PATCH 5/5] fix formatting --- transformer_lens/HookedTransformer.py | 7 ++++++- transformer_lens/utils.py | 4 +--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py index f69f96e4f..e39a02eea 100644 --- a/transformer_lens/HookedTransformer.py +++ b/transformer_lens/HookedTransformer.py @@ -583,7 +583,12 @@ def forward( self, prepend_bos=prepend_bos, padding_side=padding_side ): if start_at_layer is None: - (residual, tokens, shortformer_pos_embed, attention_mask,) = self.input_to_embed( + ( + residual, + tokens, + shortformer_pos_embed, + attention_mask, + ) = self.input_to_embed( input, prepend_bos=prepend_bos, padding_side=padding_side, diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index 7d5712ea0..414a6b99f 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -496,9 +496,7 @@ def sample_logits( SliceInput = Optional[ Union[ int, - Tuple[ - int, - ], + Tuple[int,], Tuple[int, int], Tuple[int, int, int], List[int],