From d849e38200d445c692015dba844ede63fddb91bc Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Wed, 18 Feb 2026 20:44:32 -0500 Subject: [PATCH 1/5] set sequence-length-is-longer-than-the-specified-maximum to false --- transformer_lens/utils.py | 121 +++++++++++++++++++++----------------- 1 file changed, 68 insertions(+), 53 deletions(-) diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index d13371cc9..f2a38dd8b 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -351,62 +351,77 @@ def tokenize_and_concatenate( if tokenizer.pad_token is None: # We add a padding token, purely to implement the tokenizer. This will be removed before inputting tokens to the model, so we do not need to increment d_vocab in the model. tokenizer.add_special_tokens({"pad_token": ""}) - # Define the length to chop things up into - leaving space for a bos_token if required - if add_bos_token: - seq_len = max_length - 1 - else: - seq_len = max_length - - def tokenize_function(examples: dict[str, list[str]]) -> dict[str, np.ndarray]: - text = examples[column_name] - # Concatenate it all into an enormous string, separated by eos_tokens - assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." - full_text = tokenizer.eos_token.join(text) - - # Handle the case when full_text is empty - if not full_text.strip(): - return {"tokens": np.array([], dtype=np.int64)} - - # Divide into 20 chunks of ~ equal length - num_chunks = 20 - chunk_length = (len(full_text) - 1) // num_chunks + 1 - chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] - # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned - tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() - # Drop padding tokens - tokens = tokens[tokens != tokenizer.pad_token_id] - num_tokens = len(tokens) - - # Handle cases where num_tokens is less than seq_len - if num_tokens < seq_len: - num_batches = 1 - # Pad tokens if necessary - tokens = tokens[:seq_len] - if len(tokens) < seq_len: - padding_length = seq_len - len(tokens) - padding = np.full(padding_length, tokenizer.pad_token_id) - tokens = np.concatenate([tokens, padding], axis=0) - else: - num_batches = num_tokens // seq_len - # Drop the final tokens if not enough to make a full sequence - tokens = tokens[: seq_len * num_batches] - tokens = einops.rearrange( - tokens, "(batch seq) -> batch seq", batch=num_batches, seq=seq_len + # Suppress the "sequence length longer than maximum" warning during chunked tokenization. 
+ _deprecation_warnings_saved = None + if hasattr(tokenizer, "deprecation_warnings") and isinstance( + tokenizer.deprecation_warnings, dict + ): + _deprecation_warnings_saved = tokenizer.deprecation_warnings.copy() + tokenizer.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = ( + False ) + try: + # Define the length to chop things up into - leaving space for a bos_token if required if add_bos_token: - prefix = np.full((num_batches, 1), tokenizer.bos_token_id) - tokens = np.concatenate([prefix, tokens], axis=1) - return {"tokens": tokens} - - tokenized_dataset = dataset.map( - tokenize_function, - batched=True, - num_proc=(num_proc if not streaming else None), - remove_columns=[column_name], - ) - tokenized_dataset.set_format(type="torch", columns=["tokens"]) - return tokenized_dataset + seq_len = max_length - 1 + else: + seq_len = max_length + + def tokenize_function(examples: dict[str, list[str]]) -> dict[str, np.ndarray]: + text = examples[column_name] + # Concatenate it all into an enormous string, separated by eos_tokens + assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." + full_text = tokenizer.eos_token.join(text) + + # Handle the case when full_text is empty + if not full_text.strip(): + return {"tokens": np.array([], dtype=np.int64)} + + # Divide into 20 chunks of ~ equal length + num_chunks = 20 + chunk_length = (len(full_text) - 1) // num_chunks + 1 + chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] + # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned + tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() + # Drop padding tokens + tokens = tokens[tokens != tokenizer.pad_token_id] + num_tokens = len(tokens) + + # Handle cases where num_tokens is less than seq_len + if num_tokens < seq_len: + num_batches = 1 + # Pad tokens if necessary + tokens = tokens[:seq_len] + if len(tokens) < seq_len: + padding_length = seq_len - len(tokens) + padding = np.full(padding_length, tokenizer.pad_token_id) + tokens = np.concatenate([tokens, padding], axis=0) + else: + num_batches = num_tokens // seq_len + # Drop the final tokens if not enough to make a full sequence + tokens = tokens[: seq_len * num_batches] + + tokens = einops.rearrange( + tokens, "(batch seq) -> batch seq", batch=num_batches, seq=seq_len + ) + if add_bos_token: + prefix = np.full((num_batches, 1), tokenizer.bos_token_id) + tokens = np.concatenate([prefix, tokens], axis=1) + return {"tokens": tokens} + + tokenized_dataset = dataset.map( + tokenize_function, + batched=True, + num_proc=(num_proc if not streaming else None), + remove_columns=[column_name], + ) + tokenized_dataset.set_format(type="torch", columns=["tokens"]) + return tokenized_dataset + finally: + if _deprecation_warnings_saved is not None: + tokenizer.deprecation_warnings.clear() + tokenizer.deprecation_warnings.update(_deprecation_warnings_saved) def sample_logits( From 61fccf508c142ee81717918894b6d9e3aa35f024 Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Wed, 18 Feb 2026 20:53:28 -0500 Subject: [PATCH 2/5] add test --- tests/unit/test_utils.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index 242df3987..4fe632802 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -1,3 +1,5 @@ +import warnings + import numpy as np import pytest import torch @@ -433,3 +435,40 @@ def 
test_init_xavier_normal(self, d_model, d_mlp): x_new = nn.Parameter(torch.empty(2, d_model, 137)) utils.init_xavier_normal_(x_new) assert torch.allclose(x_new, x, rtol=1e-2) + + +def test_tokenize_and_concatenate_no_spurious_sequence_length_warning(): + """Test that tokenize_and_concatenate does not emit the HF 'sequence length longer than maximum' warning.""" + from datasets import Dataset + from transformers import AutoTokenizer + + # Use a tokenizer with model_max_length and EOS + tokenizer = AutoTokenizer.from_pretrained("t5-small") + assert tokenizer.model_max_length == 512 + assert tokenizer.eos_token is not None + + # Long text so that when split into 20 chunks, at least one chunk tokenizes to > 512 tokens + long_text = "word " * 20000 + dataset = Dataset.from_dict({"text": [long_text]}) + + with warnings.catch_warnings(record=True) as recorded: + warnings.simplefilter("always") + result = utils.tokenize_and_concatenate( + dataset, + tokenizer, + max_length=tokenizer.model_max_length, + add_bos_token=False, + ) + + # No warning about sequence length exceeding model maximum + for w in recorded: + msg = str(w.message) + assert "longer than the specified maximum" not in msg, ( + f"tokenize_and_concatenate should not emit sequence-length warning; got: {msg}" + ) + + # Sanity: output has expected shape (batch, max_length); result is a Dataset + assert len(result) >= 1 + first_row = result[0]["tokens"] + assert first_row.shape[0] == tokenizer.model_max_length + assert first_row.dim() == 1 From 46103587b907c65f83a32ab094de897ed212d5bb Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Wed, 18 Feb 2026 21:34:00 -0500 Subject: [PATCH 3/5] fix test --- transformer_lens/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index f2a38dd8b..133bf4de1 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -368,7 +368,8 @@ def tokenize_and_concatenate( else: seq_len = max_length - def tokenize_function(examples: dict[str, list[str]]) -> dict[str, np.ndarray]: + def tokenize_function(examples: Any) -> dict[str, np.ndarray]: + # datasets.map() may pass a LazyBatch, not a plain dict; accept dict-like batches text = examples[column_name] # Concatenate it all into an enormous string, separated by eos_tokens assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." 
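
The looser annotation in PATCH 3/5 reflects that datasets.map() with batched=True may hand the mapping function a LazyBatch rather than a plain dict, so only mapping-style access on the batch is guaranteed. A minimal illustration of that behaviour, separate from the patch itself (hypothetical function name, assuming a recent datasets release):

    from datasets import Dataset

    def inspect_batch(batch):
        # On recent datasets versions this prints "LazyBatch", not "dict";
        # the function should only rely on mapping-style access like batch["text"].
        print(type(batch).__name__)
        return {"n_chars": [len(t) for t in batch["text"]]}

    ds = Dataset.from_dict({"text": ["hello", "world"]})
    ds = ds.map(inspect_batch, batched=True)
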
From 34d1d33e27a5072c15003a600f277fe136d3b7bc Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Thu, 19 Feb 2026 17:16:02 -0500 Subject: [PATCH 4/5] fix tests --- tests/unit/test_utils.py | 7 +-- transformer_lens/HookedTransformer.py | 7 +-- transformer_lens/utils.py | 65 ++++++--------------------- 3 files changed, 19 insertions(+), 60 deletions(-) diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py index a4d349ca0..02ee73807 100644 --- a/tests/unit/test_utils.py +++ b/tests/unit/test_utils.py @@ -458,14 +458,15 @@ def test_tokenize_and_concatenate_no_spurious_sequence_length_warning(): tokenizer, max_length=tokenizer.model_max_length, add_bos_token=False, + streaming=True, ) # No warning about sequence length exceeding model maximum for w in recorded: msg = str(w.message) - assert "longer than the specified maximum" not in msg, ( - f"tokenize_and_concatenate should not emit sequence-length warning; got: {msg}" - ) + assert ( + "longer than the specified maximum" not in msg + ), f"tokenize_and_concatenate should not emit sequence-length warning; got: {msg}" # Sanity: output has expected shape (batch, max_length); result is a Dataset assert len(result) >= 1 diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py index e39a02eea..f69f96e4f 100644 --- a/transformer_lens/HookedTransformer.py +++ b/transformer_lens/HookedTransformer.py @@ -583,12 +583,7 @@ def forward( self, prepend_bos=prepend_bos, padding_side=padding_side ): if start_at_layer is None: - ( - residual, - tokens, - shortformer_pos_embed, - attention_mask, - ) = self.input_to_embed( + (residual, tokens, shortformer_pos_embed, attention_mask,) = self.input_to_embed( input, prepend_bos=prepend_bos, padding_side=padding_side, diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index b694b35a5..7d5712ea0 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -353,57 +353,13 @@ def tokenize_and_concatenate( # We add a padding token, purely to implement the tokenizer. This will be removed before inputting tokens to the model, so we do not need to increment d_vocab in the model. tokenizer.add_special_tokens({"pad_token": ""}) - # Define the length to chop things up into - leaving space for a bos_token if required - if add_bos_token: - seq_len = max_length - 1 - else: - seq_len = max_length - - def tokenize_function(examples: Any) -> dict[str, np.ndarray]: - # datasets.map() may pass a LazyBatch, not a plain dict; accept dict-like batches - text = examples[column_name] - # Concatenate it all into an enormous string, separated by eos_tokens - assert tokenizer.eos_token is not None, "Tokenizer must have an EOS token." - full_text = tokenizer.eos_token.join(text) - - # Handle the case when full_text is empty - if not full_text.strip(): - return {"tokens": np.array([], dtype=np.int64)} - - # Divide into 20 chunks of ~ equal length - num_chunks = 20 - chunk_length = (len(full_text) - 1) // num_chunks + 1 - chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] - # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned - tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() - # Drop padding tokens - tokens = tokens[tokens != tokenizer.pad_token_id] - num_tokens = len(tokens) - - # Handle cases where num_tokens is less than seq_len - if num_tokens < seq_len: - num_batches = 1 - # Pad tokens if necessary. Use eos_token_id if the model has no pad token. 
- tokens = tokens[:seq_len] - if len(tokens) < seq_len: - padding_length = seq_len - len(tokens) - padding_id = tokenizer.eos_token_id if not has_pad_token else tokenizer.pad_token_id - padding = np.full(padding_length, padding_id) - tokens = np.concatenate([tokens, padding], axis=0) - else: - num_batches = num_tokens // seq_len - # Drop the final tokens if not enough to make a full sequence - tokens = tokens[: seq_len * num_batches] - # Suppress the "sequence length longer than maximum" warning during chunked tokenization. _deprecation_warnings_saved = None - if hasattr(tokenizer, "deprecation_warnings") and isinstance( - tokenizer.deprecation_warnings, dict - ): + if hasattr(tokenizer, "deprecation_warnings"): _deprecation_warnings_saved = tokenizer.deprecation_warnings.copy() - tokenizer.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = ( - False - ) + tokenizer.deprecation_warnings[ + "sequence-length-is-longer-than-the-specified-maximum" + ] = False try: # Define the length to chop things up into - leaving space for a bos_token if required if add_bos_token: @@ -425,7 +381,9 @@ def tokenize_function(examples: Any) -> dict[str, np.ndarray]: # Divide into 20 chunks of ~ equal length num_chunks = 20 chunk_length = (len(full_text) - 1) // num_chunks + 1 - chunks = [full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks)] + chunks = [ + full_text[i * chunk_length : (i + 1) * chunk_length] for i in range(num_chunks) + ] # Tokenize the chunks in parallel. Uses NumPy because HuggingFace map doesn't want tensors returned tokens = tokenizer(chunks, return_tensors="np", padding=True)["input_ids"].flatten() # Drop padding tokens @@ -439,7 +397,10 @@ def tokenize_function(examples: Any) -> dict[str, np.ndarray]: tokens = tokens[:seq_len] if len(tokens) < seq_len: padding_length = seq_len - len(tokens) - padding = np.full(padding_length, tokenizer.pad_token_id) + padding_id = ( + tokenizer.eos_token_id if not has_pad_token else tokenizer.pad_token_id + ) + padding = np.full(padding_length, padding_id) tokens = np.concatenate([tokens, padding], axis=0) else: num_batches = num_tokens // seq_len @@ -535,7 +496,9 @@ def sample_logits( SliceInput = Optional[ Union[ int, - Tuple[int,], + Tuple[ + int, + ], Tuple[int, int], Tuple[int, int, int], List[int], From 86a94679c6eb79663ca3c1488005d2dee1abffdd Mon Sep 17 00:00:00 2001 From: Evelyn Yen Date: Thu, 19 Feb 2026 17:25:27 -0500 Subject: [PATCH 5/5] fix formatting --- transformer_lens/HookedTransformer.py | 7 ++++++- transformer_lens/utils.py | 4 +--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/transformer_lens/HookedTransformer.py b/transformer_lens/HookedTransformer.py index f69f96e4f..e39a02eea 100644 --- a/transformer_lens/HookedTransformer.py +++ b/transformer_lens/HookedTransformer.py @@ -583,7 +583,12 @@ def forward( self, prepend_bos=prepend_bos, padding_side=padding_side ): if start_at_layer is None: - (residual, tokens, shortformer_pos_embed, attention_mask,) = self.input_to_embed( + ( + residual, + tokens, + shortformer_pos_embed, + attention_mask, + ) = self.input_to_embed( input, prepend_bos=prepend_bos, padding_side=padding_side, diff --git a/transformer_lens/utils.py b/transformer_lens/utils.py index 7d5712ea0..414a6b99f 100644 --- a/transformer_lens/utils.py +++ b/transformer_lens/utils.py @@ -496,9 +496,7 @@ def sample_logits( SliceInput = Optional[ Union[ int, - Tuple[ - int, - ], + Tuple[int,], Tuple[int, int], Tuple[int, int, int], List[int],