Fix natten (#22229)

* Add kernel size to NATTEN's QK arguments.

The new NATTEN 0.14.5 supports PyTorch 2.0, but it also adds an extra
argument to the QK operation so that relative positional biases (RPBs)
become optional.

Because the call signature changed, the existing NATTEN tests fail
against 0.14.5.

This commit re-enables NATTEN in the CircleCI config and passes the new
kernel_size argument so the tests pass again; see the usage sketch after
the diffs below.

* Force NATTEN >= 0.14.5
Ali Hassani 2023-03-17 11:07:55 -04:00 committed by GitHub
parent 074490b2c2
commit 3028b20a71
5 changed files with 5 additions and 6 deletions

@@ -374,8 +374,7 @@ exotic_models_job = CircleCIJob(
         "pip install 'git+https://github.com/facebookresearch/detectron2.git'",
         "sudo apt install tesseract-ocr",
         "pip install pytesseract",
-        # wait until natten is ready for torch 2.0.0
-        # "pip install natten",
+        "pip install natten",
     ],
     tests_to_run=[
         "tests/models/*layoutlmv*",

@@ -129,7 +129,7 @@ _deps = [
     "keras-nlp>=0.3.1",
     "librosa",
     "nltk",
-    "natten>=0.14.4",
+    "natten>=0.14.5",
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",

@@ -35,7 +35,7 @@ deps = {
     "keras-nlp": "keras-nlp>=0.3.1",
     "librosa": "librosa",
     "nltk": "nltk",
-    "natten": "natten>=0.14.4",
+    "natten": "natten>=0.14.5",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",

@@ -347,7 +347,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)

@@ -339,7 +339,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
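
For reference, a minimal sketch of calling the updated op directly. It
assumes natten>=0.14.5 is installed; the tensor shapes and values here are
illustrative, not part of the patch.

import torch
from natten.functional import natten2dqkrpb

# Illustrative shapes: NATTEN's 2D ops take (batch, heads, height, width, head_dim).
batch, heads, height, width, head_dim = 1, 2, 8, 8, 16
kernel_size, dilation = 3, 1
query = torch.randn(batch, heads, height, width, head_dim)
key = torch.randn(batch, heads, height, width, head_dim)
# Relative positional bias table: one (2k - 1) x (2k - 1) grid per head.
rpb = torch.randn(heads, 2 * kernel_size - 1, 2 * kernel_size - 1)

# As of 0.14.5, kernel_size is passed explicitly before dilation; earlier
# releases took natten2dqkrpb(query, key, rpb, dilation), as the diffs show.
attention_scores = natten2dqkrpb(query, key, rpb, kernel_size, dilation)
attention_probs = torch.softmax(attention_scores, dim=-1)

The explicit kernel_size is presumably what makes rpb optional in 0.14.5:
with no bias table to read the neighborhood size from, the op needs the
kernel size as its own argument.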