MaxSimE: Explaining Transformer-based Semantic Similarity via Contextualized Best Matching Token Pairs
Brito & Iser, SIGIR 2023
pip install git+https://github.com/ebritoc/MaxSimE.git
MaxSimE is model-agnostic: it works with any transformer that produces token-level embeddings (ColBERT, SentenceTransformers, PyLate, etc.).
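For example, token embeddings from a plain HuggingFace encoder can be passed in directly. The sketch below is illustrative rather than documented MaxSimE usage: the checkpoint name is arbitrary and token_embeddings is local glue code, not a package function.

import torch
from transformers import AutoModel, AutoTokenizer
from maxsime import MaxSimExplainer

# Any checkpoint that yields token-level hidden states will do; this one is only an example.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoder = AutoModel.from_pretrained("bert-base-uncased")

def token_embeddings(text):
    """Return (tokens, token-level embeddings as a NumPy array) for one string."""
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**inputs).last_hidden_state[0]  # (seq_len, dim)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    return tokens, hidden.numpy()

query_tokens, query_emb = token_embeddings("What is the CET1 ratio?")
doc_tokens, doc_emb = token_embeddings("The Common Equity Tier 1 ratio stands at 13.9%.")

explanation = MaxSimExplainer().explain(query_emb, doc_emb, query_tokens, doc_tokens)
print(explanation.top_k(3))

The quick start below shows the same flow with SentenceTransformers, including visualization.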
import numpy as np
from maxsime import MaxSimExplainer, plot_similarity_heatmap
# Initialize explainer
explainer = MaxSimExplainer()
# Get token embeddings from your model
# (example with SentenceTransformer)
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
query = "What is the CET1 ratio?"
doc = "The Common Equity Tier 1 ratio stands at 13.9%."
# Encode with token-level output
query_emb = model.encode(query, output_value="token_embeddings")
doc_emb = model.encode(doc, output_value="token_embeddings")
# Get tokens
query_tokens = model.tokenizer.convert_ids_to_tokens(
    model.tokenizer.encode(query)
)
doc_tokens = model.tokenizer.convert_ids_to_tokens(
    model.tokenizer.encode(doc)
)
# Compute explanation
explanation = explainer.explain(
    query_emb, doc_emb,
    query_tokens, doc_tokens,
)
# View top matches
print("Top token alignments:")
for pair in explanation.top_k(5):
    print(f"  {pair}")
# Visualize
fig = plot_similarity_heatmap(explanation)
fig.savefig("maxsime_heatmap.png")
print("Saved heatmap to maxsime_heatmap.png")from maxsime import MaxSimExplainer
from maxsime import MaxSimExplainer
explainer = MaxSimExplainer(special_tokens=None)
explanation = explainer.explain(
    query_embeddings,        # np.ndarray, shape (Q, dim)
    doc_embeddings,          # np.ndarray, shape (D, dim)
    query_tokens,            # list[str]
    doc_tokens,              # list[str]
    filter_special_tokens=True,
)
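Special-token handling is configurable via the constructor's special_tokens argument and the filter_special_tokens flag. A hedged sketch, assuming special_tokens accepts a collection of token strings to exclude (the accepted type is an assumption, not documented here):

explainer = MaxSimExplainer(special_tokens={"[CLS]", "[SEP]", "[PAD]"})  # assumed value format
explanation = explainer.explain(
    query_embeddings, doc_embeddings,
    query_tokens, doc_tokens,
    filter_special_tokens=True,  # drop special-token matches from the alignment
)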
explanation.query_tokens # list[str]
explanation.doc_tokens # list[str]
explanation.alignment # list[TokenPair] - MaxSim matches
explanation.total_score # float - sum of MaxSim scores
explanation.similarity_matrix # np.ndarray - full (Q x D) cosine sim matrix
explanation.top_k(k=10) # Top-k pairs by similarity
explanation.top_pairs # All pairs sorted by similarity
explanation.matched_doc_indices # set of doc indices that were matched
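These fields compose directly. Continuing from the quick start, the sketch below uses only the attributes listed above to report each query token's best similarity and the document tokens that were never selected as a best match (it assumes the matrix columns correspond one-to-one to doc_tokens):

import numpy as np

per_query_best = explanation.similarity_matrix.max(axis=1)  # best match per query token
print("Mean best-match similarity:", float(per_query_best.mean()))

# Document tokens that no query token picked as its best match
unmatched = [
    tok for i, tok in enumerate(explanation.doc_tokens)
    if i not in explanation.matched_doc_indices
]
print("Doc tokens never matched:", unmatched)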
from maxsime import plot_similarity_heatmap, plot_token_alignment
# Heatmap of similarity matrix
fig = plot_similarity_heatmap(
    explanation,
    title="My Explanation",
    highlight_alignment=True,
    figsize=(14, 6),
)
# Bar chart of top alignments
fig = plot_token_alignment(
    explanation,
    top_k=10,
)
# Save directly
from maxsime import save_explanation_figure
save_explanation_figure(explanation, "output.png", plot_type="heatmap")
The original reproduction code is preserved in the legacy/ directory.
See legacy/README.md for instructions on reproducing the exact paper results using the LoTTE benchmark.
Core dependencies (installed automatically):
numpy>=1.21
matplotlib>=3.5
No PyTorch, ColBERT, or GPU required for the core package.
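This means precomputed token embeddings can be explained without any deep-learning framework installed. A minimal sketch with stand-in arrays (random only so the snippet runs on its own; in practice they would come from whatever encoder built your index):

import numpy as np
from maxsime import MaxSimExplainer

rng = np.random.default_rng(0)
query_emb = rng.normal(size=(6, 768))   # stand-in for (Q, dim) token embeddings
doc_emb = rng.normal(size=(12, 768))    # stand-in for (D, dim) token embeddings
query_tokens = [f"q{i}" for i in range(6)]
doc_tokens = [f"d{i}" for i in range(12)]

explanation = MaxSimExplainer().explain(query_emb, doc_emb, query_tokens, doc_tokens)
print(explanation.total_score)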
For development:
pip install -e ".[dev]"
pytest tests/
If you use this code, please cite the original SIGIR 2023 paper:
@inproceedings{10.1145/3539618.3592017,
author = {Brito, Eduardo and Iser, Henri},
title = {MaxSimE: Explaining Transformer-based Semantic Similarity via Contextualized Best Matching Token Pairs},
year = {2023},
isbn = {9781450394086},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3539618.3592017},
doi = {10.1145/3539618.3592017},
booktitle = {Proceedings of the 46th International ACM SIGIR Conference on Research and Development in Information Retrieval},
pages = {2154--2158},
location = {Taipei, Taiwan},
series = {SIGIR '23}
}
MIT License - see LICENSE for details.