diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 6a45f94a6..0b26762b0 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -374,8 +374,7 @@ exotic_models_job = CircleCIJob( "pip install 'git+https://github.com/facebookresearch/detectron2.git'", "sudo apt install tesseract-ocr", "pip install pytesseract", - # wait until natten is ready for torch 2.0.0 - # "pip install natten", + "pip install natten", ], tests_to_run=[ "tests/models/*layoutlmv*", diff --git a/setup.py b/setup.py index 943bb196b..c28387a3d 100644 --- a/setup.py +++ b/setup.py @@ -129,7 +129,7 @@ _deps = [ "keras-nlp>=0.3.1", "librosa", "nltk", - "natten>=0.14.4", + "natten>=0.14.5", "numpy>=1.17", "onnxconverter-common", "onnxruntime-tools>=1.4.2", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 79f9118ae..aa638a6a9 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -35,7 +35,7 @@ deps = { "keras-nlp": "keras-nlp>=0.3.1", "librosa": "librosa", "nltk": "nltk", - "natten": "natten>=0.14.4", + "natten": "natten>=0.14.5", "numpy": "numpy>=1.17", "onnxconverter-common": "onnxconverter-common", "onnxruntime-tools": "onnxruntime-tools>=1.4.2", diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py index 95191d52b..efeb68846 100644 --- a/src/transformers/models/dinat/modeling_dinat.py +++ b/src/transformers/models/dinat/modeling_dinat.py @@ -347,7 +347,7 @@ class NeighborhoodAttention(nn.Module): query_layer = query_layer / math.sqrt(self.attention_head_size) # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases. - attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation) + attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation) # Normalize the attention scores to probabilities. attention_probs = nn.functional.softmax(attention_scores, dim=-1) diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py index 4b34fe730..3a93b81e4 100644 --- a/src/transformers/models/nat/modeling_nat.py +++ b/src/transformers/models/nat/modeling_nat.py @@ -339,7 +339,7 @@ class NeighborhoodAttention(nn.Module): query_layer = query_layer / math.sqrt(self.attention_head_size) # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases. - attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1) + attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1) # Normalize the attention scores to probabilities. attention_probs = nn.functional.softmax(attention_scores, dim=-1)