diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py
index 6a45f94a6..0b26762b0 100644
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@@ -374,8 +374,7 @@ exotic_models_job = CircleCIJob(
         "pip install 'git+https://github.com/facebookresearch/detectron2.git'",
         "sudo apt install tesseract-ocr",
         "pip install pytesseract",
-        # wait until natten is ready for torch 2.0.0
-        # "pip install natten",
+        "pip install natten",
     ],
     tests_to_run=[
         "tests/models/*layoutlmv*",
diff --git a/setup.py b/setup.py
index 943bb196b..c28387a3d 100644
--- a/setup.py
+++ b/setup.py
@@ -129,7 +129,7 @@ _deps = [
     "keras-nlp>=0.3.1",
     "librosa",
     "nltk",
-    "natten>=0.14.4",
+    "natten>=0.14.5",
     "numpy>=1.17",
     "onnxconverter-common",
     "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 79f9118ae..aa638a6a9 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -35,7 +35,7 @@ deps = {
     "keras-nlp": "keras-nlp>=0.3.1",
     "librosa": "librosa",
     "nltk": "nltk",
-    "natten": "natten>=0.14.4",
+    "natten": "natten>=0.14.5",
     "numpy": "numpy>=1.17",
     "onnxconverter-common": "onnxconverter-common",
     "onnxruntime-tools": "onnxruntime-tools>=1.4.2",
diff --git a/src/transformers/models/dinat/modeling_dinat.py b/src/transformers/models/dinat/modeling_dinat.py
index 95191d52b..efeb68846 100644
--- a/src/transformers/models/dinat/modeling_dinat.py
+++ b/src/transformers/models/dinat/modeling_dinat.py
@@ -347,7 +347,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)
 
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.dilation)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)
diff --git a/src/transformers/models/nat/modeling_nat.py b/src/transformers/models/nat/modeling_nat.py
index 4b34fe730..3a93b81e4 100644
--- a/src/transformers/models/nat/modeling_nat.py
+++ b/src/transformers/models/nat/modeling_nat.py
@@ -339,7 +339,7 @@ class NeighborhoodAttention(nn.Module):
         query_layer = query_layer / math.sqrt(self.attention_head_size)
 
         # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
-        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, 1)
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.functional.softmax(attention_scores, dim=-1)