From cee98c903bbeccfffe5636c6fdfb4805edcaa1fc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 24 May 2021 21:08:47 -0700 Subject: [PATCH 01/10] Internal change PiperOrigin-RevId: 375621932 --- official/nlp/finetuning/superglue/run_superglue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/official/nlp/finetuning/superglue/run_superglue.py b/official/nlp/finetuning/superglue/run_superglue.py index bac41e0a129..8797e16e476 100644 --- a/official/nlp/finetuning/superglue/run_superglue.py +++ b/official/nlp/finetuning/superglue/run_superglue.py @@ -27,6 +27,7 @@ from official.common import distribute_utils # Imports registered experiment configs. +from official.common import registry_imports # pylint: disable=unused-import from official.core import exp_factory from official.core import task_factory from official.core import train_lib From 3d03e675d444186a49a665ce2e1be32a24c59215 Mon Sep 17 00:00:00 2001 From: Yeqing Li Date: Tue, 25 May 2021 09:41:10 -0700 Subject: [PATCH 02/10] Updates default learning rate for UCF-101 experiment. PiperOrigin-RevId: 375725859 --- official/vision/beta/configs/video_classification.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/official/vision/beta/configs/video_classification.py b/official/vision/beta/configs/video_classification.py index b6ede36172e..d6d3c9499a1 100644 --- a/official/vision/beta/configs/video_classification.py +++ b/official/vision/beta/configs/video_classification.py @@ -254,7 +254,12 @@ def video_classification_ucf101() -> cfg.ExperimentConfig: 'task.validation_data.is_training != None', 'task.train_data.num_classes == task.validation_data.num_classes', ]) - add_trainer(config, train_batch_size=64, eval_batch_size=16, train_epochs=100) + add_trainer( + config, + train_batch_size=64, + eval_batch_size=16, + learning_rate=0.8, + train_epochs=100) return config From cda3bca5d53b6a09d8c0a3e2952feba297cbc096 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 May 2021 09:58:42 -0700 Subject: [PATCH 03/10] Updating READMEs for DLRM Model. PiperOrigin-RevId: 375729667 --- official/README-TPU.md | 3 +++ official/recommendation/ranking/README.md | 12 ++++++------ 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/official/README-TPU.md b/official/README-TPU.md index 28a5a0a73d2..a6031c44f03 100644 --- a/official/README-TPU.md +++ b/official/README-TPU.md @@ -26,4 +26,7 @@ * [shapemask](vision/detection): An object detection and instance segmentation model using shape priors. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/ZbXgVoc6Rf6mBRlPj0JpLA). ## Recommendation +* [dlrm](recommendation/ranking): [Deep Learning Recommendation Model for +Personalization and Recommendation Systems](https://arxiv.org/abs/1906.00091). +* [dcn v2](recommendation/ranking): [Improved Deep & Cross Network and Practical Lessons for Web-scale Learning to Rank Systems](https://arxiv.org/abs/2008.13535). * [ncf](recommendation): Neural Collaborative Filtering. See [Tensorboard.dev training metrics](https://tensorboard.dev/experiment/0k3gKjZlR1ewkVTRyLB6IQ). 
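Patch 02 above only changes the default experiment configuration for UCF-101; the training loop itself is untouched. A quick way to see what the registered experiment now resolves to is to pull it from the experiment factory. The following is a minimal sketch, not part of any patch: it assumes the experiment stays registered under the name `video_classification_ucf101` and that importing the configs module is what registers it, and the printed fields are illustrative.

```python
# Sketch: inspect the UCF-101 defaults produced by add_trainer() in patch 02.
# Assumes the experiment name 'video_classification_ucf101' and that importing
# the configs module registers it with exp_factory.
from official.core import exp_factory
from official.vision.beta.configs import video_classification  # pylint: disable=unused-import

config = exp_factory.get_exp_config('video_classification_ucf101')
print(config.task.train_data.global_batch_size)       # expected: 64, per the patch
print(config.trainer.optimizer_config.learning_rate)  # should carry the new 0.8 default
```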
diff --git a/official/recommendation/ranking/README.md b/official/recommendation/ranking/README.md index 1d42b4f278d..dff361a2378 100644 --- a/official/recommendation/ranking/README.md +++ b/official/recommendation/ranking/README.md @@ -16,8 +16,8 @@ When training on TPUs we use [TPUEmbedding layer](https://github.com/tensorflow/recommenders/blob/main/tensorflow_recommenders/layers/embedding/tpu_embedding_layer.py) for categorical features. TPU embedding supports large embedding tables with fast lookup, the size of embedding tables scales linearly with the size of TPU -pod. We can have up to 96 GB embedding tables for TPU v3-8 and 6.14 TB for -v3-512 and 24.6 TB for TPU Pod v3-2048. +pod. We can have up to 90 GB embedding tables for TPU v3-8 and 5.6 TB for +v3-512 and 22.4 TB for TPU Pod v3-2048. The Model code is in [TensorFlow Recommenders](https://github.com/tensorflow/recommenders/tree/main/tensorflow_recommenders/experimental/models) @@ -30,7 +30,7 @@ Recommenders](https://www.tensorflow.org/recommenders) library. ```bash git clone https://github.com/tensorflow/models.git -pip install -r models/official/requirements.txt +pip install tensorflow-recommenders export PYTHONPATH=$PYTHONPATH:$(pwd)/models ``` @@ -98,10 +98,10 @@ export EXPERIMENT_NAME=my_experiment_name export BUCKET_NAME="gs://my_dlrm_bucket" export DATA_DIR="${BUCKET_NAME}/data" -python3 official/recommendation/ranking/main.py --mode=train_and_eval \ +python3 models/official/recommendation/ranking/train.py --mode=train_and_eval \ --model_dir=${BUCKET_NAME}/model_dirs/${EXPERIMENT_NAME} --params_override=" runtime: - distribution_strategy='tpu' + distribution_strategy: 'tpu' task: use_synthetic_data: false train_data: @@ -125,7 +125,7 @@ trainer: checkpoint_interval: 100000 validation_steps: 5440 train_steps: 256054 - steps_per_execution: 1000 + steps_per_loop: 1000 " ``` From 50905fd236c25fcb4841528cc4f05f8e35a63836 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 May 2021 16:56:09 -0700 Subject: [PATCH 04/10] Removing tensorflow recommenders library from requirement.txt. PiperOrigin-RevId: 375826079 --- official/recommendation/ranking/README.md | 22 +++++++++++++++++---- official/requirements.txt | 1 - official/utils/testing/scripts/presubmit.sh | 4 ++-- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/official/recommendation/ranking/README.md b/official/recommendation/ranking/README.md index dff361a2378..9c2ca21039f 100644 --- a/official/recommendation/ranking/README.md +++ b/official/recommendation/ranking/README.md @@ -25,16 +25,30 @@ library, while input pipeline, configuration and training loop is here. ## Prerequisites To get started, download the code from TensorFlow models GitHub repository or -use the pre-installed Google Cloud VM. We also need to install [TensorFlow -Recommenders](https://www.tensorflow.org/recommenders) library. +use the pre-installed Google Cloud VM. ```bash git clone https://github.com/tensorflow/models.git -pip install tensorflow-recommenders export PYTHONPATH=$PYTHONPATH:$(pwd)/models ``` -Make sure to use TensorFlow 2.4+. +We also need to install +[TensorFlow Recommenders](https://www.tensorflow.org/recommenders) library. +If you are using [tf-nightly](https://pypi.org/project/tf-nightly/) make +sure to install +[tensorflow-recommenders](https://pypi.org/project/tensorflow-recommenders/) +without its dependencies by passing `--no-deps` argument. 
+ +For tf-nightly: +```bash +pip install tensorflow-recommenders --no-deps +``` + +For stable TensorFlow 2.4+ [releases](https://pypi.org/project/tensorflow/): +```bash +pip install tensorflow-recommenders +``` + ## Dataset diff --git a/official/requirements.txt b/official/requirements.txt index 0c734c580b0..74028adcb55 100644 --- a/official/requirements.txt +++ b/official/requirements.txt @@ -12,7 +12,6 @@ tensorflow-hub>=0.6.0 tensorflow-model-optimization>=0.4.1 tensorflow-datasets tensorflow-addons -tensorflow-recommenders>=0.5.0 dataclasses;python_version<"3.7" gin-config tf_slim>=1.1.0 diff --git a/official/utils/testing/scripts/presubmit.sh b/official/utils/testing/scripts/presubmit.sh index 954d96df7f8..33eca3cbb41 100755 --- a/official/utils/testing/scripts/presubmit.sh +++ b/official/utils/testing/scripts/presubmit.sh @@ -31,8 +31,8 @@ py_test() { local exit_code=0 echo "===========Running Python test============" - - for test_file in `find official/ -name '*test.py' -print` + # Skipping Ranking tests, TODO(b/189265753) remove it once the issue is fixed. + for test_file in `find official/ -name '*test.py' -print | grep -v 'official/recommendation/ranking'` do echo "####=======Testing ${test_file}=======####" ${PY_BINARY} "${test_file}" From fda53f7875bfe56d00fe0faeb0064ea8fc5ecf60 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 25 May 2021 20:25:28 -0700 Subject: [PATCH 05/10] Internal change PiperOrigin-RevId: 375854504 --- official/nlp/data/classifier_data_lib.py | 86 +++++++++++++++++++ official/nlp/data/create_finetuning_data.py | 8 +- .../nlp/finetuning/superglue/run_superglue.py | 6 +- 3 files changed, 97 insertions(+), 3 deletions(-) diff --git a/official/nlp/data/classifier_data_lib.py b/official/nlp/data/classifier_data_lib.py index 222485a9f4f..168a2ae4390 100644 --- a/official/nlp/data/classifier_data_lib.py +++ b/official/nlp/data/classifier_data_lib.py @@ -1316,6 +1316,92 @@ def _create_examples(self, lines, set_type): return examples +class BoolQProcessor(DataProcessor): + """Processor for the BoolQ dataset (SuperGLUE diagnostics dataset).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test") + + def get_labels(self): + """See base class.""" + return ["True", "False"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "BoolQ" + + def _create_examples(self, lines, set_type): + """Creates examples for the training/dev/test sets.""" + examples = [] + for line in lines: + guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"]))) + text_a = self.process_text_fn(line["question"]) + text_b = self.process_text_fn(line["passage"]) + if set_type == "test": + label = "False" + else: + label = str(line["label"]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class CBProcessor(DataProcessor): + """Processor for the CB dataset (SuperGLUE diagnostics dataset).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "train.jsonl")), 
"train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test") + + def get_labels(self): + """See base class.""" + return ["entailment", "neutral", "contradiction"] + + @staticmethod + def get_processor_name(): + """See base class.""" + return "CB" + + def _create_examples(self, lines, set_type): + """Creates examples for the training/dev/test sets.""" + examples = [] + for line in lines: + guid = "%s-%s" % (set_type, self.process_text_fn(str(line["idx"]))) + text_a = self.process_text_fn(line["premise"]) + text_b = self.process_text_fn(line["hypothesis"]) + if set_type == "test": + label = "entailment" + else: + label = self.process_text_fn(line["label"]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + class SuperGLUERTEProcessor(DataProcessor): """Processor for the RTE dataset (SuperGLUE version).""" diff --git a/official/nlp/data/create_finetuning_data.py b/official/nlp/data/create_finetuning_data.py index 14b2bbc0463..9d31c9a5000 100644 --- a/official/nlp/data/create_finetuning_data.py +++ b/official/nlp/data/create_finetuning_data.py @@ -50,7 +50,7 @@ "classification_task_name", "MNLI", [ "AX", "COLA", "IMDB", "MNLI", "MRPC", "PAWS-X", "QNLI", "QQP", "RTE", "SST-2", "STS-B", "WNLI", "XNLI", "XTREME-XNLI", "XTREME-PAWS-X", - "AX-g", "SUPERGLUE-RTE" + "AX-g", "SUPERGLUE-RTE", "CB", "BoolQ" ], "The name of the task to train BERT classifier. The " "difference between XTREME-XNLI and XNLI is: 1. the format " "of input tsv files; 2. the dev set for XTREME is english " @@ -243,7 +243,11 @@ def generate_classifier_dataset(): "ax-g": classifier_data_lib.AXgProcessor, "superglue-rte": - classifier_data_lib.SuperGLUERTEProcessor + classifier_data_lib.SuperGLUERTEProcessor, + "cb": + classifier_data_lib.CBProcessor, + "boolq": + classifier_data_lib.BoolQProcessor, } task_name = FLAGS.classification_task_name.lower() if task_name not in processors: diff --git a/official/nlp/finetuning/superglue/run_superglue.py b/official/nlp/finetuning/superglue/run_superglue.py index 8797e16e476..01025a88f93 100644 --- a/official/nlp/finetuning/superglue/run_superglue.py +++ b/official/nlp/finetuning/superglue/run_superglue.py @@ -65,6 +65,8 @@ AXG_CLASS_NAMES = ['entailment', 'not_entailment'] RTE_CLASS_NAMES = ['entailment', 'not_entailment'] +CB_CLASS_NAMES = ['entailment', 'neutral', 'contradiction'] +BOOLQ_CLASS_NAMES = ['True', 'False'] def _override_exp_config_by_file(exp_config, exp_config_files): @@ -154,7 +156,9 @@ def _write_submission_file(task, seq_length): write_fn = binary_helper.write_superglue_classification write_fn_map = { 'RTE': functools.partial(write_fn, class_names=RTE_CLASS_NAMES), - 'AX-g': functools.partial(write_fn, class_names=AXG_CLASS_NAMES) + 'AX-g': functools.partial(write_fn, class_names=AXG_CLASS_NAMES), + 'CB': functools.partial(write_fn, class_names=CB_CLASS_NAMES), + 'BoolQ': functools.partial(write_fn, class_names=BOOLQ_CLASS_NAMES) } logging.info('Predicting %s', FLAGS.test_input_path) write_fn_map[FLAGS.task_name]( From 081bee8c681eaf87b89c2ca8b63c0e913f42da95 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 26 May 2021 10:45:03 -0700 Subject: [PATCH 06/10] Internal change PiperOrigin-RevId: 375979776 --- official/nlp/data/classifier_data_lib.py | 46 +++++++----------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/official/nlp/data/classifier_data_lib.py b/official/nlp/data/classifier_data_lib.py index 168a2ae4390..2498c327094 100644 --- a/official/nlp/data/classifier_data_lib.py +++ b/official/nlp/data/classifier_data_lib.py @@ -1316,8 +1316,8 @@ def _create_examples(self, lines, set_type): return examples -class BoolQProcessor(DataProcessor): - """Processor for the BoolQ dataset (SuperGLUE diagnostics dataset).""" +class SuperGLUEDataProcessor(DataProcessor): + """Processor for the SuperGLUE dataset.""" def get_train_examples(self, data_dir): """See base class.""" @@ -1334,6 +1334,14 @@ def get_test_examples(self, data_dir): return self._create_examples( self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test") + def _create_examples(self, lines, set_type): + """Creates examples for the training/dev/test sets.""" + raise NotImplementedError() + + +class BoolQProcessor(SuperGLUEDataProcessor): + """Processor for the BoolQ dataset (SuperGLUE diagnostics dataset).""" + def get_labels(self): """See base class.""" return ["True", "False"] @@ -1359,24 +1367,9 @@ def _create_examples(self, lines, set_type): return examples -class CBProcessor(DataProcessor): +class CBProcessor(SuperGLUEDataProcessor): """Processor for the CB dataset (SuperGLUE diagnostics dataset).""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test") - def get_labels(self): """See base class.""" return ["entailment", "neutral", "contradiction"] @@ -1402,24 +1395,9 @@ def _create_examples(self, lines, set_type): return examples -class SuperGLUERTEProcessor(DataProcessor): +class SuperGLUERTEProcessor(SuperGLUEDataProcessor): """Processor for the RTE dataset (SuperGLUE version).""" - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "train.jsonl")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "val.jsonl")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_jsonl(os.path.join(data_dir, "test.jsonl")), "test") - def get_labels(self): """See base class.""" # All datasets are converted to 2-class split, where for 3-class datasets we From 27be57eba9566e856fd8ebfa008212b91ff92000 Mon Sep 17 00:00:00 2001 From: Abdullah Rashwan Date: Wed, 26 May 2021 14:41:24 -0700 Subject: [PATCH 07/10] Internal change PiperOrigin-RevId: 376032565 --- .../optimization/configs/optimizer_config.py | 4 ++++ official/modeling/optimization/ema_optimizer.py | 16 +++++++++++++--- .../k400_resnet3drs_50_tpu.yaml | 1 + .../vision/beta/configs/image_classification.py | 3 ++- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/official/modeling/optimization/configs/optimizer_config.py 
b/official/modeling/optimization/configs/optimizer_config.py index 37f9db50f59..7b4de948248 100644 --- a/official/modeling/optimization/configs/optimizer_config.py +++ b/official/modeling/optimization/configs/optimizer_config.py @@ -180,11 +180,15 @@ class EMAConfig(BaseOptimizerConfig): Attributes: name: 'str', name of the optimizer. + trainable_weights_only: 'bool', if True, only model trainable weights will + be updated. Otherwise, all model weights will be updated. This mainly + affects batch normalization parameters. average_decay: 'float', average decay value. start_step: 'int', start step to apply moving average. dynamic_decay: 'bool', whether to apply dynamic decay or not. """ name: str = "ExponentialMovingAverage" + trainable_weights_only: bool = True average_decay: float = 0.99 start_step: int = 0 dynamic_decay: bool = True diff --git a/official/modeling/optimization/ema_optimizer.py b/official/modeling/optimization/ema_optimizer.py index 5c746ad7d1a..3bf3c3607df 100644 --- a/official/modeling/optimization/ema_optimizer.py +++ b/official/modeling/optimization/ema_optimizer.py @@ -48,6 +48,7 @@ class ExponentialMovingAverage(tf.keras.optimizers.Optimizer): def __init__(self, optimizer: tf.keras.optimizers.Optimizer, + trainable_weights_only: bool = True, average_decay: float = 0.99, start_step: int = 0, dynamic_decay: bool = True, @@ -58,6 +59,9 @@ def __init__(self, Args: optimizer: `tf.keras.optimizers.Optimizer` that will be used to compute and apply gradients. + trainable_weights_only: 'bool', if True, only model trainable weights will + be updated. Otherwise, all model weights will be updated. This mainly + affects batch normalization parameters. average_decay: float. Decay to use to maintain the moving averages of trained variables. start_step: int. What step to start the moving average. 
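As a rough illustration of how the new `trainable_weights_only` flag is meant to be used: the following is a sketch only, not part of this patch. It assumes a TF 2.x release contemporary with this change (where `tf.keras.optimizers.Optimizer` is the V2 optimizer with slot support), and the toy model and hyperparameters are made up.

```python
# Sketch: wrap a base optimizer with EMA and shadow all model variables, not
# just the trainable ones, so batch-norm moving statistics are averaged too.
import tensorflow as tf
from official.modeling.optimization import ema_optimizer

model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_shape=(4,)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1),
])

base = tf.keras.optimizers.SGD(learning_rate=0.1)
optimizer = ema_optimizer.ExponentialMovingAverage(
    base,
    trainable_weights_only=False,  # the flag added in this patch
    average_decay=0.9999)
optimizer.shadow_copy(model)  # creates an 'average' slot per shadowed variable
```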
@@ -72,6 +76,7 @@ def __init__(self, """ super().__init__(name, **kwargs) self._average_decay = average_decay + self._trainable_weights_only = trainable_weights_only self._start_step = tf.constant(start_step, tf.float32) self._dynamic_decay = dynamic_decay self._optimizer = optimizer @@ -81,12 +86,17 @@ def __init__(self, def shadow_copy(self, model: tf.keras.Model): """Creates shadow variables for the given model weights.""" - for var in model.weights: + + if self._trainable_weights_only: + self._model_weights = model.trainable_variables + else: + self._model_weights = model.variables + for var in self._model_weights: self.add_slot(var, 'average', initializer='zeros') + self._average_weights = [ - self.get_slot(var, 'average') for var in model.weights + self.get_slot(var, 'average') for var in self._model_weights ] - self._model_weights = model.weights @property def has_shadow_copy(self): diff --git a/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml b/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml index 3d68f539601..83875d1273a 100644 --- a/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml +++ b/official/vision/beta/configs/experiments/video_classification/k400_resnet3drs_50_tpu.yaml @@ -80,6 +80,7 @@ trainer: optimizer_config: ema: average_decay: 0.9999 + trainable_weights_only: false learning_rate: cosine: decay_steps: 73682 diff --git a/official/vision/beta/configs/image_classification.py b/official/vision/beta/configs/image_classification.py index e80c85f87fd..7044a4c0004 100644 --- a/official/vision/beta/configs/image_classification.py +++ b/official/vision/beta/configs/image_classification.py @@ -227,7 +227,8 @@ def image_classification_imagenet_resnetrs() -> cfg.ExperimentConfig: } }, 'ema': { - 'average_decay': 0.9999 + 'average_decay': 0.9999, + 'trainable_weights_only': False, }, 'learning_rate': { 'type': 'cosine', From 63719f08ae3073dede98d0722a096f77891aa965 Mon Sep 17 00:00:00 2001 From: Anirudh Vegesana Date: Mon, 24 May 2021 20:03:15 -0400 Subject: [PATCH 08/10] YOLO Family: Updated model (#9923) * Update YOLO model * Fix some docstrings * Fix docstrings * Address some of Dr. 
Davis' changes * Give descriptive names to the test cases * Fix bugs * Fix YOLO head imports * docstring and variable name updates * docstring and variable name updates * docstring and variable name updates Co-authored-by: vishnubanna Co-authored-by: Vishnu Banna <43182884+vishnubanna@users.noreply.github.com> --- .../beta/projects/yolo/configs/backbones.py | 6 +- .../yolo/configs/darknet_classification.py | 2 +- .../yolo/modeling/backbones/darknet.py | 608 +++++-- .../yolo/modeling/backbones/darknet_test.py | 71 +- .../yolo/modeling/decoders/__init__.py | 0 .../yolo/modeling/decoders/yolo_decoder.py | 484 ++++++ .../modeling/decoders/yolo_decoder_test.py | 154 ++ .../projects/yolo/modeling/heads/__init__.py | 0 .../projects/yolo/modeling/heads/yolo_head.py | 120 ++ .../yolo/modeling/heads/yolo_head_test.py | 77 + .../yolo/modeling/layers/nn_blocks.py | 1500 +++++++++++++---- .../yolo/modeling/layers/nn_blocks_test.py | 201 ++- 12 files changed, 2590 insertions(+), 633 deletions(-) create mode 100644 official/vision/beta/projects/yolo/modeling/decoders/__init__.py create mode 100644 official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py create mode 100644 official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py create mode 100644 official/vision/beta/projects/yolo/modeling/heads/__init__.py create mode 100644 official/vision/beta/projects/yolo/modeling/heads/yolo_head.py create mode 100644 official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py diff --git a/official/vision/beta/projects/yolo/configs/backbones.py b/official/vision/beta/projects/yolo/configs/backbones.py index a79cb09e17e..a99141543e1 100644 --- a/official/vision/beta/projects/yolo/configs/backbones.py +++ b/official/vision/beta/projects/yolo/configs/backbones.py @@ -24,11 +24,11 @@ @dataclasses.dataclass -class DarkNet(hyperparams.Config): - """DarkNet config.""" +class Darknet(hyperparams.Config): + """Darknet config.""" model_id: str = "darknet53" @dataclasses.dataclass class Backbone(backbones.Backbone): - darknet: DarkNet = DarkNet() + darknet: Darknet = Darknet() diff --git a/official/vision/beta/projects/yolo/configs/darknet_classification.py b/official/vision/beta/projects/yolo/configs/darknet_classification.py index b33e149d484..ffaf387fac0 100644 --- a/official/vision/beta/projects/yolo/configs/darknet_classification.py +++ b/official/vision/beta/projects/yolo/configs/darknet_classification.py @@ -32,7 +32,7 @@ class ImageClassificationModel(hyperparams.Config): num_classes: int = 0 input_size: List[int] = dataclasses.field(default_factory=list) backbone: backbones.Backbone = backbones.Backbone( - type='darknet', resnet=backbones.DarkNet()) + type='darknet', darknet=backbones.Darknet()) dropout_rate: float = 0.0 norm_activation: common.NormActivation = common.NormActivation() # Adds a BatchNormalization layer pre-GlobalAveragePooling in classification diff --git a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py index 170c6bb7680..db00dfd5ad5 100644 --- a/official/vision/beta/projects/yolo/modeling/backbones/darknet.py +++ b/official/vision/beta/projects/yolo/modeling/backbones/darknet.py @@ -29,44 +29,51 @@ arXiv:1911.11929 -DarkNets Are used mainly for Object detection in: +Darknets are used mainly for object detection in: [1] Joseph Redmon, Ali Farhadi YOLOv3: An Incremental Improvement. 
arXiv:1804.02767 [2] Alexey Bochkovskiy, Chien-Yao Wang, Hong-Yuan Mark Liao YOLOv4: Optimal Speed and Accuracy of Object Detection. arXiv:2004.10934 """ -import collections +import collections import tensorflow as tf from official.modeling import hyperparams from official.vision.beta.modeling.backbones import factory from official.vision.beta.projects.yolo.modeling.layers import nn_blocks +# builder required classes -class BlockConfig(object): - """Get layer config to make code more readable. - Args: - layer: string layer name - stack: the type of layer ordering to use for this specific level - repetitions: integer for the number of times to repeat block - bottelneck: boolean for does this stack have a bottle neck layer - filters: integer for the output depth of the level - pool_size: integer the pool_size of max pool layers - kernel_size: optional integer, for convolution kernel size - strides: integer or tuple to indicate convolution strides - padding: the padding to apply to layers in this stack - activation: string for the activation to use for this stack - route: integer for what level to route from to get the next input - output_name: the name to use for this output - is_output: is this layer an output in the default model +class BlockConfig: + """ + Class to store layer config to make code more readable """ def __init__(self, layer, stack, reps, bottleneck, filters, pool_size, - kernel_size, strides, padding, activation, route, output_name, - is_output): + kernel_size, strides, padding, activation, route, dilation_rate, + output_name, is_output): + """ + Args: + layer: A `str` for layer name. + stack: A `str` for the type of layer ordering to use for this specific + level. + reps: An `int` for the number of times to repeat block. + bottleneck: A `bool` for whether this stack has a bottle neck layer. + filters: An `int` for the output depth of the level. + pool_size: An `int` for the pool_size of max pool layers. + kernel_size: An `int` for convolution kernel size. + strides: A `Union[int, tuple]` that indicates convolution strides. + padding: An `int` for the padding to apply to layers in this stack. + activation: A `str` for the activation to use for this stack. + route: An `int` for the level to route from to get the next input. + dilation_rate: An `int` for the scale used in dialated Darknet. + output_name: A `str` for the name to use for this output. + is_output: A `bool` for whether this layer is an output in the default + model. + """ self.layer = layer self.stack = stack self.repetitions = reps @@ -78,6 +85,7 @@ def __init__(self, layer, stack, reps, bottleneck, filters, pool_size, self.padding = padding self.activation = activation self.route = route + self.dilation_rate = dilation_rate self.output_name = output_name self.is_output = is_output @@ -89,41 +97,40 @@ def build_block_specs(config): return specs -class LayerFactory(object): - """Class for quick look up of default layers. - - Used by darknet to connect, introduce or exit a level. Used in place of an if - condition or switch to make adding new layers easier and to reduce redundant - code. +class LayerBuilder: + """ + class for quick look up of default layers used by darknet to + connect, introduce or exit a level. 
Used in place of an if condition + or switch to make adding new layers easier and to reduce redundant code """ def __init__(self): self._layer_dict = { - "ConvBN": (nn_blocks.ConvBN, self.conv_bn_config_todict), - "MaxPool": (tf.keras.layers.MaxPool2D, self.maxpool_config_todict) + 'ConvBN': (nn_blocks.ConvBN, self.conv_bn_config_todict), + 'MaxPool': (tf.keras.layers.MaxPool2D, self.maxpool_config_todict) } def conv_bn_config_todict(self, config, kwargs): dictvals = { - "filters": config.filters, - "kernel_size": config.kernel_size, - "strides": config.strides, - "padding": config.padding + 'filters': config.filters, + 'kernel_size': config.kernel_size, + 'strides': config.strides, + 'padding': config.padding } dictvals.update(kwargs) return dictvals def darktiny_config_todict(self, config, kwargs): - dictvals = {"filters": config.filters, "strides": config.strides} + dictvals = {'filters': config.filters, 'strides': config.strides} dictvals.update(kwargs) return dictvals def maxpool_config_todict(self, config, kwargs): return { - "pool_size": config.pool_size, - "strides": config.strides, - "padding": config.padding, - "name": kwargs["name"] + 'pool_size': config.pool_size, + 'strides': config.strides, + 'padding': config.padding, + 'name': kwargs['name'] } def __call__(self, config, kwargs): @@ -134,90 +141,259 @@ def __call__(self, config, kwargs): # model configs LISTNAMES = [ - "default_layer_name", "level_type", "number_of_layers_in_level", - "bottleneck", "filters", "kernal_size", "pool_size", "strides", "padding", - "default_activation", "route", "level/name", "is_output" + 'default_layer_name', 'level_type', 'number_of_layers_in_level', + 'bottleneck', 'filters', 'kernal_size', 'pool_size', 'strides', 'padding', + 'default_activation', 'route', 'dilation', 'level/name', 'is_output' ] -# pylint: disable=line-too-long CSPDARKNET53 = { - "list_names": LISTNAMES, - "splits": {"backbone_split": 106, - "neck_split": 138}, - "backbone": [ - ["ConvBN", None, 1, False, 32, None, 3, 1, "same", "mish", -1, 0, False], - ["DarkRes", "csp", 1, True, 64, None, None, None, None, "mish", -1, 1, False], - ["DarkRes", "csp", 2, False, 128, None, None, None, None, "mish", -1, 2, False], - ["DarkRes", "csp", 8, False, 256, None, None, None, None, "mish", -1, 3, True], - ["DarkRes", "csp", 8, False, 512, None, None, None, None, "mish", -1, 4, True], - ["DarkRes", "csp", 4, False, 1024, None, None, None, None, "mish", -1, 5, True], + 'list_names': + LISTNAMES, + 'splits': { + 'backbone_split': 106, + 'neck_split': 132 + }, + 'backbone': [ + [ + 'ConvBN', None, 1, False, 32, None, 3, 1, 'same', 'mish', -1, 1, 0, + False + ], + [ + 'DarkRes', 'csp', 1, True, 64, None, None, None, None, 'mish', -1, + 1, 1, False + ], + [ + 'DarkRes', 'csp', 2, False, 128, None, None, None, None, 'mish', -1, + 1, 2, False + ], + [ + 'DarkRes', 'csp', 8, False, 256, None, None, None, None, 'mish', -1, + 1, 3, True + ], + [ + 'DarkRes', 'csp', 8, False, 512, None, None, None, None, 'mish', -1, + 2, 4, True + ], + [ + 'DarkRes', 'csp', 4, False, 1024, None, None, None, None, 'mish', + -1, 4, 5, True + ], + ] +} + +CSPADARKNET53 = { + 'list_names': + LISTNAMES, + 'splits': { + 'backbone_split': 100, + 'neck_split': 135 + }, + 'backbone': [ + [ + 'ConvBN', None, 1, False, 32, None, 3, 1, 'same', 'mish', -1, 1, 0, + False + ], + [ + 'DarkRes', 'residual', 1, True, 64, None, None, None, None, 'mish', + -1, 1, 1, False + ], + [ + 'DarkRes', 'csp', 2, False, 128, None, None, None, None, 'mish', -1, + 1, 2, False + ], + [ + 
'DarkRes', 'csp', 8, False, 256, None, None, None, None, 'mish', -1, + 1, 3, True + ], + [ + 'DarkRes', 'csp', 8, False, 512, None, None, None, None, 'mish', -1, + 2, 4, True + ], + [ + 'DarkRes', 'csp', 4, False, 1024, None, None, None, None, 'mish', + -1, 4, 5, True + ], + ] +} + +LARGECSP53 = { + 'list_names': + LISTNAMES, + 'splits': { + 'backbone_split': 100, + 'neck_split': 135 + }, + 'backbone': [ + [ + 'ConvBN', None, 1, False, 32, None, 3, 1, 'same', 'mish', -1, 1, 0, + False + ], + [ + 'DarkRes', 'csp', 1, True, 64, None, None, None, None, 'mish', -1, + 1, 1, False + ], + [ + 'DarkRes', 'csp', 3, False, 128, None, None, None, None, 'mish', -1, + 1, 2, False + ], + [ + 'DarkRes', 'csp', 15, False, 256, None, None, None, None, 'mish', + -1, 1, 3, True + ], + [ + 'DarkRes', 'csp', 15, False, 512, None, None, None, None, 'mish', + -1, 2, 4, True + ], + [ + 'DarkRes', 'csp', 7, False, 1024, None, None, None, None, 'mish', + -1, 4, 5, True + ], + [ + 'DarkRes', 'csp', 7, False, 1024, None, None, None, None, 'mish', + -1, 8, 6, True + ], + [ + 'DarkRes', 'csp', 7, False, 1024, None, None, None, None, 'mish', + -1, 16, 7, True + ], ] } DARKNET53 = { - "list_names": LISTNAMES, - "splits": {"backbone_split": 76}, - "backbone": [ - ["ConvBN", None, 1, False, 32, None, 3, 1, "same", "leaky", -1, 0, False], - ["DarkRes", "residual", 1, True, 64, None, None, None, None, "leaky", -1, 1, False], - ["DarkRes", "residual", 2, False, 128, None, None, None, None, "leaky", -1, 2, False], - ["DarkRes", "residual", 8, False, 256, None, None, None, None, "leaky", -1, 3, True], - ["DarkRes", "residual", 8, False, 512, None, None, None, None, "leaky", -1, 4, True], - ["DarkRes", "residual", 4, False, 1024, None, None, None, None, "leaky", -1, 5, True], + 'list_names': + LISTNAMES, + 'splits': { + 'backbone_split': 76 + }, + 'backbone': [ + [ + 'ConvBN', None, 1, False, 32, None, 3, 1, 'same', 'leaky', -1, 1, 0, + False + ], + [ + 'DarkRes', 'residual', 1, True, 64, None, None, None, None, 'leaky', + -1, 1, 1, False + ], + [ + 'DarkRes', 'residual', 2, False, 128, None, None, None, None, + 'leaky', -1, 1, 2, False + ], + [ + 'DarkRes', 'residual', 8, False, 256, None, None, None, None, + 'leaky', -1, 1, 3, True + ], + [ + 'DarkRes', 'residual', 8, False, 512, None, None, None, None, + 'leaky', -1, 2, 4, True + ], + [ + 'DarkRes', 'residual', 4, False, 1024, None, None, None, None, + 'leaky', -1, 4, 5, True + ], ] } CSPDARKNETTINY = { - "list_names": LISTNAMES, - "splits": {"backbone_split": 28}, - "backbone": [ - ["ConvBN", None, 1, False, 32, None, 3, 2, "same", "leaky", -1, 0, False], - ["ConvBN", None, 1, False, 64, None, 3, 2, "same", "leaky", -1, 1, False], - ["CSPTiny", "csp_tiny", 1, False, 64, None, 3, 2, "same", "leaky", -1, 2, False], - ["CSPTiny", "csp_tiny", 1, False, 128, None, 3, 2, "same", "leaky", -1, 3, False], - ["CSPTiny", "csp_tiny", 1, False, 256, None, 3, 2, "same", "leaky", -1, 4, True], - ["ConvBN", None, 1, False, 512, None, 3, 1, "same", "leaky", -1, 5, True], + 'list_names': + LISTNAMES, + 'splits': { + 'backbone_split': 28 + }, + 'backbone': [ + [ + 'ConvBN', None, 1, False, 32, None, 3, 2, 'same', 'leaky', -1, 1, 0, + False + ], + [ + 'ConvBN', None, 1, False, 64, None, 3, 2, 'same', 'leaky', -1, 1, 1, + False + ], + [ + 'CSPTiny', 'csp_tiny', 1, False, 64, None, 3, 2, 'same', 'leaky', + -1, 1, 2, False + ], + [ + 'CSPTiny', 'csp_tiny', 1, False, 128, None, 3, 2, 'same', 'leaky', + -1, 1, 3, False + ], + [ + 'CSPTiny', 'csp_tiny', 1, False, 256, None, 3, 2, 'same', 'leaky', + 
-1, 1, 4, True + ], + [ + 'ConvBN', None, 1, False, 512, None, 3, 1, 'same', 'leaky', -1, 1, + 5, True + ], ] } DARKNETTINY = { - "list_names": LISTNAMES, - "splits": {"backbone_split": 14}, - "backbone": [ - ["ConvBN", None, 1, False, 16, None, 3, 1, "same", "leaky", -1, 0, False], - ["DarkTiny", "tiny", 1, True, 32, None, 3, 2, "same", "leaky", -1, 1, False], - ["DarkTiny", "tiny", 1, True, 64, None, 3, 2, "same", "leaky", -1, 2, False], - ["DarkTiny", "tiny", 1, False, 128, None, 3, 2, "same", "leaky", -1, 3, False], - ["DarkTiny", "tiny", 1, False, 256, None, 3, 2, "same", "leaky", -1, 4, True], - ["DarkTiny", "tiny", 1, False, 512, None, 3, 2, "same", "leaky", -1, 5, False], - ["DarkTiny", "tiny", 1, False, 1024, None, 3, 1, "same", "leaky", -1, 5, True], + 'list_names': + LISTNAMES, + 'splits': { + 'backbone_split': 14 + }, + 'backbone': [ + [ + 'ConvBN', None, 1, False, 16, None, 3, 1, 'same', 'leaky', -1, 1, 0, + False + ], + [ + 'DarkTiny', 'tiny', 1, True, 32, None, 3, 2, 'same', 'leaky', -1, 1, + 1, False + ], + [ + 'DarkTiny', 'tiny', 1, True, 64, None, 3, 2, 'same', 'leaky', -1, 1, + 2, False + ], + [ + 'DarkTiny', 'tiny', 1, False, 128, None, 3, 2, 'same', 'leaky', -1, + 1, 3, False + ], + [ + 'DarkTiny', 'tiny', 1, False, 256, None, 3, 2, 'same', 'leaky', -1, + 1, 4, True + ], + [ + 'DarkTiny', 'tiny', 1, False, 512, None, 3, 2, 'same', 'leaky', -1, + 1, 5, False + ], + [ + 'DarkTiny', 'tiny', 1, False, 1024, None, 3, 1, 'same', 'leaky', -1, + 1, 5, True + ], ] } -# pylint: enable=line-too-long BACKBONES = { - "darknettiny": DARKNETTINY, - "darknet53": DARKNET53, - "cspdarknet53": CSPDARKNET53, - "cspdarknettiny": CSPDARKNETTINY + 'darknettiny': DARKNETTINY, + 'darknet53': DARKNET53, + 'cspdarknet53': CSPDARKNET53, + 'altered_cspdarknet53': CSPADARKNET53, + 'cspdarknettiny': CSPDARKNETTINY, + 'csp-large': LARGECSP53, } -@tf.keras.utils.register_keras_serializable(package="yolo") +@tf.keras.utils.register_keras_serializable(package='yolo') class Darknet(tf.keras.Model): - """Darknet backbone.""" + """ The Darknet backbone architecture """ def __init__( self, - model_id="darknet53", + model_id='darknet53', input_specs=tf.keras.layers.InputSpec(shape=[None, None, None, 3]), min_level=None, max_level=5, + width_scale=1.0, + depth_scale=1.0, + csp_level_mod=(), activation=None, use_sync_bn=False, norm_momentum=0.99, norm_epsilon=0.001, - kernel_initializer="glorot_uniform", + dilate=False, + kernel_initializer='glorot_uniform', kernel_regularizer=None, bias_regularizer=None, **kwargs): @@ -227,12 +403,13 @@ def __init__( self._model_name = model_id self._splits = splits self._input_shape = input_specs - self._registry = LayerFactory() + self._registry = LayerBuilder() # default layer look up self._min_size = min_level self._max_size = max_level self._output_specs = None + self._csp_level_mod = set(csp_level_mod) self._kernel_initializer = kernel_initializer self._bias_regularizer = bias_regularizer @@ -241,16 +418,20 @@ def __init__( self._use_sync_bn = use_sync_bn self._activation = activation self._kernel_regularizer = kernel_regularizer + self._dilate = dilate + self._width_scale = width_scale + self._depth_scale = depth_scale self._default_dict = { - "kernel_initializer": self._kernel_initializer, - "kernel_regularizer": self._kernel_regularizer, - "bias_regularizer": self._bias_regularizer, - "norm_momentum": self._norm_momentum, - "norm_epsilon": self._norm_epislon, - "use_sync_bn": self._use_sync_bn, - "activation": self._activation, - "name": None + 'kernel_initializer': 
self._kernel_initializer, + 'kernel_regularizer': self._kernel_regularizer, + 'bias_regularizer': self._bias_regularizer, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epislon, + 'use_sync_bn': self._use_sync_bn, + 'activation': self._activation, + 'dilation_rate': 1, + 'name': None } inputs = tf.keras.layers.Input(shape=self._input_shape.shape[1:]) @@ -273,33 +454,39 @@ def _build_struct(self, net, inputs): endpoints = collections.OrderedDict() stack_outputs = [inputs] for i, config in enumerate(net): + if config.output_name > self._max_size: + break + if config.output_name in self._csp_level_mod: + config.stack = 'residual' + + config.filters = int(config.filters * self._width_scale) + config.repetitions = int(config.repetitions * self._depth_scale) + if config.stack is None: - x = self._build_block(stack_outputs[config.route], - config, - name=f"{config.layer}_{i}") + x = self._build_block( + stack_outputs[config.route], config, name=f'{config.layer}_{i}') stack_outputs.append(x) - elif config.stack == "residual": - x = self._residual_stack(stack_outputs[config.route], - config, - name=f"{config.layer}_{i}") + elif config.stack == 'residual': + x = self._residual_stack( + stack_outputs[config.route], config, name=f'{config.layer}_{i}') stack_outputs.append(x) - elif config.stack == "csp": - x = self._csp_stack(stack_outputs[config.route], - config, - name=f"{config.layer}_{i}") + elif config.stack == 'csp': + x = self._csp_stack( + stack_outputs[config.route], config, name=f'{config.layer}_{i}') stack_outputs.append(x) - elif config.stack == "csp_tiny": - x_pass, x = self._csp_tiny_stack(stack_outputs[config.route], - config, name=f"{config.layer}_{i}") + elif config.stack == 'csp_tiny': + x_pass, x = self._csp_tiny_stack( + stack_outputs[config.route], config, name=f'{config.layer}_{i}') stack_outputs.append(x_pass) - elif config.stack == "tiny": - x = self._tiny_stack(stack_outputs[config.route], - config, - name=f"{config.layer}_{i}") + elif config.stack == 'tiny': + x = self._tiny_stack( + stack_outputs[config.route], config, name=f'{config.layer}_{i}') stack_outputs.append(x) if (config.is_output and self._min_size is None): endpoints[str(config.output_name)] = x - elif self._min_size is not None and config.output_name >= self._min_size and config.output_name <= self._max_size: + elif (self._min_size is not None and + config.output_name >= self._min_size and + config.output_name <= self._max_size): endpoints[str(config.output_name)] = x self._output_specs = {l: endpoints[l].get_shape() for l in endpoints.keys()} @@ -308,8 +495,7 @@ def _build_struct(self, net, inputs): def _get_activation(self, activation): if self._activation is None: return activation - else: - return self._activation + return self._activation def _csp_stack(self, inputs, config, name): if config.bottleneck: @@ -320,86 +506,135 @@ def _csp_stack(self, inputs, config, name): csp_filter_scale = 2 residual_filter_scale = 1 scale_filters = 2 - self._default_dict["activation"] = self._get_activation(config.activation) - self._default_dict["name"] = f"{name}_csp_down" - x, x_route = nn_blocks.CSPRoute(filters=config.filters, - filter_scale=csp_filter_scale, - downsample=True, - **self._default_dict)(inputs) - for i in range(config.repetitions): - self._default_dict["name"] = f"{name}_{i}" - x = nn_blocks.DarkResidual(filters=config.filters // scale_filters, - filter_scale=residual_filter_scale, - **self._default_dict)(x) - - self._default_dict["name"] = f"{name}_csp_connect" - output = 
nn_blocks.CSPConnect(filters=config.filters, - filter_scale=csp_filter_scale, - **self._default_dict)([x, x_route]) - self._default_dict["activation"] = self._activation - self._default_dict["name"] = None + self._default_dict['activation'] = self._get_activation(config.activation) + self._default_dict['name'] = f'{name}_csp_down' + if self._dilate: + self._default_dict['dilation_rate'] = config.dilation_rate + else: + self._default_dict['dilation_rate'] = 1 + + # swap/add dilation + x, x_route = nn_blocks.CSPRoute( + filters=config.filters, + filter_scale=csp_filter_scale, + downsample=True, + **self._default_dict)( + inputs) + + dilated_reps = config.repetitions - self._default_dict['dilation_rate'] // 2 + for i in range(dilated_reps): + self._default_dict['name'] = f'{name}_{i}' + x = nn_blocks.DarkResidual( + filters=config.filters // scale_filters, + filter_scale=residual_filter_scale, + **self._default_dict)( + x) + + for i in range(dilated_reps, config.repetitions): + self._default_dict[ + 'dilation_rate'] = self._default_dict['dilation_rate'] // 2 + self._default_dict[ + 'name'] = f"{name}_{i}_degridded_{self._default_dict['dilation_rate']}" + x = nn_blocks.DarkResidual( + filters=config.filters // scale_filters, + filter_scale=residual_filter_scale, + **self._default_dict)( + x) + + self._default_dict['name'] = f'{name}_csp_connect' + output = nn_blocks.CSPConnect( + filters=config.filters, + filter_scale=csp_filter_scale, + **self._default_dict)([x, x_route]) + self._default_dict['activation'] = self._activation + self._default_dict['name'] = None return output def _csp_tiny_stack(self, inputs, config, name): - self._default_dict["activation"] = self._get_activation(config.activation) - self._default_dict["name"] = f"{name}_csp_tiny" - x, x_route = nn_blocks.CSPTiny(filters=config.filters, - **self._default_dict)(inputs) - self._default_dict["activation"] = self._activation - self._default_dict["name"] = None + self._default_dict['activation'] = self._get_activation(config.activation) + self._default_dict['name'] = f'{name}_csp_tiny' + x, x_route = nn_blocks.CSPTiny( + filters=config.filters, **self._default_dict)( + inputs) + self._default_dict['activation'] = self._activation + self._default_dict['name'] = None return x, x_route def _tiny_stack(self, inputs, config, name): - x = tf.keras.layers.MaxPool2D(pool_size=2, - strides=config.strides, - padding="same", - data_format=None, - name=f"{name}_tiny/pool")(inputs) - self._default_dict["activation"] = self._get_activation(config.activation) - self._default_dict["name"] = f"{name}_tiny/conv" + x = tf.keras.layers.MaxPool2D( + pool_size=2, + strides=config.strides, + padding='same', + data_format=None, + name=f'{name}_tiny/pool')( + inputs) + self._default_dict['activation'] = self._get_activation(config.activation) + self._default_dict['name'] = f'{name}_tiny/conv' x = nn_blocks.ConvBN( filters=config.filters, kernel_size=(3, 3), strides=(1, 1), - padding="same", + padding='same', **self._default_dict)( x) - self._default_dict["activation"] = self._activation - self._default_dict["name"] = None + self._default_dict['activation'] = self._activation + self._default_dict['name'] = None return x def _residual_stack(self, inputs, config, name): - self._default_dict["activation"] = self._get_activation(config.activation) - self._default_dict["name"] = f"{name}_residual_down" - x = nn_blocks.DarkResidual(filters=config.filters, - downsample=True, - **self._default_dict)(inputs) - for i in range(config.repetitions - 1): - 
self._default_dict["name"] = f"{name}_{i}" - x = nn_blocks.DarkResidual(filters=config.filters, - **self._default_dict)(x) - self._default_dict["activation"] = self._activation - self._default_dict["name"] = None + self._default_dict['activation'] = self._get_activation(config.activation) + self._default_dict['name'] = f'{name}_residual_down' + if self._dilate: + self._default_dict['dilation_rate'] = config.dilation_rate + if config.repetitions < 8: + config.repetitions += 2 + else: + self._default_dict['dilation_rate'] = 1 + + x = nn_blocks.DarkResidual( + filters=config.filters, downsample=True, **self._default_dict)( + inputs) + + dilated_reps = config.repetitions - \ + (self._default_dict['dilation_rate'] // 2) - 1 + for i in range(dilated_reps): + self._default_dict['name'] = f'{name}_{i}' + x = nn_blocks.DarkResidual( + filters=config.filters, **self._default_dict)( + x) + + for i in range(dilated_reps, config.repetitions - 1): + self._default_dict[ + 'dilation_rate'] = self._default_dict['dilation_rate'] // 2 + self._default_dict[ + 'name'] = f"{name}_{i}_degridded_{self._default_dict['dilation_rate']}" + x = nn_blocks.DarkResidual( + filters=config.filters, **self._default_dict)( + x) + + self._default_dict['activation'] = self._activation + self._default_dict['name'] = None + self._default_dict['dilation_rate'] = 1 return x def _build_block(self, inputs, config, name): x = inputs i = 0 - self._default_dict["activation"] = self._get_activation(config.activation) + self._default_dict['activation'] = self._get_activation(config.activation) while i < config.repetitions: - self._default_dict["name"] = f"{name}_{i}" + self._default_dict['name'] = f'{name}_{i}' layer = self._registry(config, self._default_dict) x = layer(x) i += 1 - self._default_dict["activation"] = self._activation - self._default_dict["name"] = None + self._default_dict['activation'] = self._activation + self._default_dict['name'] = None return x @staticmethod def get_model_config(name): name = name.lower() - backbone = BACKBONES[name]["backbone"] - splits = BACKBONES[name]["splits"] + backbone = BACKBONES[name]['backbone'] + splits = BACKBONES[name]['splits'] return build_block_specs(backbone), splits @property @@ -412,35 +647,42 @@ def from_config(cls, config, custom_objects=None): def get_config(self): layer_config = { - "model_id": self._model_name, - "min_level": self._min_size, - "max_level": self._max_size, - "kernel_initializer": self._kernel_initializer, - "kernel_regularizer": self._kernel_regularizer, - "bias_regularizer": self._bias_regularizer, - "norm_momentum": self._norm_momentum, - "norm_epsilon": self._norm_epislon, - "use_sync_bn": self._use_sync_bn, - "activation": self._activation + 'model_id': self._model_name, + 'min_level': self._min_size, + 'max_level': self._max_size, + 'kernel_initializer': self._kernel_initializer, + 'kernel_regularizer': self._kernel_regularizer, + 'bias_regularizer': self._bias_regularizer, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epislon, + 'use_sync_bn': self._use_sync_bn, + 'activation': self._activation, } return layer_config -@factory.register_backbone_builder("darknet") +@factory.register_backbone_builder('darknet') def build_darknet( input_specs: tf.keras.layers.InputSpec, backbone_config: hyperparams.Config, norm_activation_config: hyperparams.Config, l2_regularizer: tf.keras.regularizers.Regularizer = None) -> tf.keras.Model: - """Builds darknet backbone.""" - backbone_cfg = backbone_config.get() + backbone_cfg = 
model_config.backbone.get() + norm_activation_config = model_config.norm_activation + model = Darknet( model_id=backbone_cfg.model_id, - input_shape=input_specs, + min_level=model_config.min_level, + max_level=model_config.max_level, + input_specs=input_specs, + dilate=backbone_cfg.dilate, + width_scale=backbone_cfg.width_scale, + depth_scale=backbone_cfg.depth_scale, activation=norm_activation_config.activation, use_sync_bn=norm_activation_config.use_sync_bn, norm_momentum=norm_activation_config.norm_momentum, norm_epsilon=norm_activation_config.norm_epsilon, kernel_regularizer=l2_regularizer) + model.summary() return model diff --git a/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py b/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py index 76c595f2dd7..8678c2c93e6 100644 --- a/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py +++ b/official/vision/beta/projects/yolo/modeling/backbones/darknet_test.py @@ -13,7 +13,7 @@ # limitations under the License. # Lint as: python3 -"""Tests for resnet.""" +"""Tests for yolo.""" from absl.testing import parameterized import numpy as np @@ -24,40 +24,53 @@ from official.vision.beta.projects.yolo.modeling.backbones import darknet -class DarkNetTest(parameterized.TestCase, tf.test.TestCase): +class DarknetTest(parameterized.TestCase, tf.test.TestCase): @parameterized.parameters( - (224, "darknet53", 2, 1), - (224, "darknettiny", 1, 2), - (224, "cspdarknettiny", 1, 1), - (224, "cspdarknet53", 2, 1), + (224, 'darknet53', 2, 1, True), + (224, 'darknettiny', 1, 2, False), + (224, 'cspdarknettiny', 1, 1, False), + (224, 'cspdarknet53', 2, 1, True), ) - def test_network_creation(self, input_size, model_id, - endpoint_filter_scale, scale_final): + def test_network_creation(self, input_size, model_id, endpoint_filter_scale, + scale_final, dilate): """Test creation of ResNet family models.""" - tf.keras.backend.set_image_data_format("channels_last") + tf.keras.backend.set_image_data_format('channels_last') - network = darknet.Darknet(model_id=model_id, min_level=3, max_level=5) + network = darknet.Darknet( + model_id=model_id, min_level=3, max_level=5, dilate=dilate) self.assertEqual(network.model_id, model_id) inputs = tf.keras.Input(shape=(input_size, input_size, 3), batch_size=1) endpoints = network(inputs) - self.assertAllEqual( - [1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale], - endpoints["3"].shape.as_list()) - self.assertAllEqual( - [1, input_size / 2**4, input_size / 2**4, 256 * endpoint_filter_scale], - endpoints["4"].shape.as_list()) - self.assertAllEqual([ - 1, input_size / 2**5, input_size / 2**5, - 512 * endpoint_filter_scale * scale_final - ], endpoints["5"].shape.as_list()) + if dilate: + self.assertAllEqual([ + 1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale + ], endpoints['3'].shape.as_list()) + self.assertAllEqual([ + 1, input_size / 2**3, input_size / 2**3, 256 * endpoint_filter_scale + ], endpoints['4'].shape.as_list()) + self.assertAllEqual([ + 1, input_size / 2**3, input_size / 2**3, + 512 * endpoint_filter_scale * scale_final + ], endpoints['5'].shape.as_list()) + else: + self.assertAllEqual([ + 1, input_size / 2**3, input_size / 2**3, 128 * endpoint_filter_scale + ], endpoints['3'].shape.as_list()) + self.assertAllEqual([ + 1, input_size / 2**4, input_size / 2**4, 256 * endpoint_filter_scale + ], endpoints['4'].shape.as_list()) + self.assertAllEqual([ + 1, input_size / 2**5, input_size / 2**5, + 512 * endpoint_filter_scale * 
scale_final + ], endpoints['5'].shape.as_list()) @combinations.generate( combinations.combine( strategy=[ - strategy_combinations.cloud_tpu_strategy, + strategy_combinations.tpu_strategy, strategy_combinations.one_device_strategy_gpu, ], use_sync_bn=[False, True], @@ -66,20 +79,20 @@ def test_sync_bn_multiple_devices(self, strategy, use_sync_bn): """Test for sync bn on TPU and GPU devices.""" inputs = np.random.rand(1, 224, 224, 3) - tf.keras.backend.set_image_data_format("channels_last") + tf.keras.backend.set_image_data_format('channels_last') with strategy.scope(): - network = darknet.Darknet(model_id="darknet53", min_size=3, max_size=5) + network = darknet.Darknet(model_id='darknet53', min_size=3, max_size=5) _ = network(inputs) @parameterized.parameters(1, 3, 4) def test_input_specs(self, input_dim): """Test different input feature dimensions.""" - tf.keras.backend.set_image_data_format("channels_last") + tf.keras.backend.set_image_data_format('channels_last') input_specs = tf.keras.layers.InputSpec(shape=[None, None, None, input_dim]) network = darknet.Darknet( - model_id="darknet53", min_level=3, max_level=5, input_specs=input_specs) + model_id='darknet53', min_level=3, max_level=5, input_specs=input_specs) inputs = tf.keras.Input(shape=(224, 224, input_dim), batch_size=1) _ = network(inputs) @@ -87,14 +100,14 @@ def test_input_specs(self, input_dim): def test_serialize_deserialize(self): # Create a network object that sets all of its config options. kwargs = dict( - model_id="darknet53", + model_id='darknet53', min_level=3, max_level=5, use_sync_bn=False, - activation="relu", + activation='relu', norm_momentum=0.99, norm_epsilon=0.001, - kernel_initializer="VarianceScaling", + kernel_initializer='VarianceScaling', kernel_regularizer=None, bias_regularizer=None, ) @@ -113,5 +126,5 @@ def test_serialize_deserialize(self): self.assertAllEqual(network.get_config(), new_network.get_config()) -if __name__ == "__main__": +if __name__ == '__main__': tf.test.main() diff --git a/official/vision/beta/projects/yolo/modeling/decoders/__init__.py b/official/vision/beta/projects/yolo/modeling/decoders/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py new file mode 100644 index 00000000000..1dbaae6ebf1 --- /dev/null +++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py @@ -0,0 +1,484 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Feature Pyramid Network and Path Aggregation variants used in YOLO""" + +import tensorflow as tf +from official.vision.beta.projects.yolo.modeling.layers import nn_blocks + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class _IdentityRoute(tf.keras.layers.Layer): + + def __init__(self, **kwargs): + """Private class to mirror the outputs of blocks in nn_blocks for an easier + programatic generation of the feature pyramid network""" + + super().__init__(**kwargs) + + def call(self, inputs): # pylint: disable=arguments-differ + return None, inputs + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class YoloFPN(tf.keras.layers.Layer): + """YOLO Feature pyramid network.""" + + def __init__(self, + fpn_depth=4, + use_spatial_attention=False, + csp_stack=False, + activation='leaky', + fpn_filter_scale=1, + use_sync_bn=False, + norm_momentum=0.99, + norm_epsilon=0.001, + kernel_initializer='glorot_uniform', + kernel_regularizer=None, + bias_regularizer=None, + **kwargs): + """Yolo FPN initialization function (Yolo V4). + + Args: + fpn_depth: `int`, number of layers to use in each FPN path + if you choose to use an FPN. + use_spatial_attention: `bool`, use the spatial attention module. + csp_stack: `bool`, CSPize the FPN. + activation: `str`, the activation function to use typically leaky or mish. + fpn_filter_scale: `int`, scaling factor for the FPN filters. + use_sync_bn: if True, use synchronized batch normalization. + norm_momentum: `float`, normalization momentum for the moving average. + norm_epsilon: `float`, small float added to variance to avoid dividing by + zero. + kernel_initializer: kernel_initializer for convolutional layers. + kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + **kwargs: keyword arguments to be passed. + """ + + super().__init__(**kwargs) + self._fpn_depth = fpn_depth + + self._activation = activation + self._use_sync_bn = use_sync_bn + self._norm_momentum = norm_momentum + self._norm_epsilon = norm_epsilon + self._kernel_initializer = kernel_initializer + self._kernel_regularizer = kernel_regularizer + self._bias_regularizer = bias_regularizer + self._use_spatial_attention = use_spatial_attention + self._filter_scale = fpn_filter_scale + self._csp_stack = csp_stack + + self._base_config = dict( + activation=self._activation, + use_sync_bn=self._use_sync_bn, + kernel_regularizer=self._kernel_regularizer, + kernel_initializer=self._kernel_initializer, + bias_regularizer=self._bias_regularizer, + norm_epsilon=self._norm_epsilon, + norm_momentum=self._norm_momentum) + + def get_raw_depths(self, minimum_depth, inputs): + """Calculates the unscaled depths of the FPN branches. + + Args: + minimum_depth (int): depth of the smallest branch of the FPN. + inputs (dict): dictionary of the shape of input args as a dictionary of + lists. + + Returns: + The unscaled depths of the FPN branches. + """ + + depths = [] + for i in range(self._min_level, self._max_level + 1): + depths.append(inputs[str(i)][-1] / self._filter_scale) + return list(reversed(depths)) + + def build(self, inputs): + """Use config dictionary to generate all important attributes for head + construction. + + Args: + inputs: dictionary of the shape of input args as a dictionary of lists. 
+ """ + + keys = [int(key) for key in inputs.keys()] + self._min_level = min(keys) + self._max_level = max(keys) + self._min_depth = inputs[str(self._min_level)][-1] + self._depths = self.get_raw_depths(self._min_depth, inputs) + + # directly connect to an input path and process it + self.preprocessors = dict() + # resample an input and merge it with the output of another path + # inorder to aggregate backbone outputs + self.resamples = dict() + # set of convoltion layers and upsample layers that are used to + # prepare the FPN processors for output + + for level, depth in zip( + reversed(range(self._min_level, self._max_level + 1)), self._depths): + if level == self._min_level: + self.resamples[str(level)] = nn_blocks.PathAggregationBlock( + filters=depth // 2, + inverted=True, + upsample=True, + drop_final=self._csp_stack == 0, + upsample_size=2, + **self._base_config) + self.preprocessors[str(level)] = _IdentityRoute() + elif level != self._max_level: + self.resamples[str(level)] = nn_blocks.PathAggregationBlock( + filters=depth // 2, + inverted=True, + upsample=True, + drop_final=False, + upsample_size=2, + **self._base_config) + self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess( + filters=depth, + repetitions=self._fpn_depth - int(level == self._min_level), + block_invert=True, + insert_spp=False, + csp_stack=self._csp_stack, + **self._base_config) + else: + self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess( + filters=depth, + repetitions=self._fpn_depth + 1 * int(self._csp_stack == 0), + insert_spp=True, + block_invert=False, + csp_stack=self._csp_stack, + **self._base_config) + + def call(self, inputs): + outputs = dict() + layer_in = inputs[str(self._max_level)] + for level in reversed(range(self._min_level, self._max_level + 1)): + _, x = self.preprocessors[str(level)](layer_in) + outputs[str(level)] = x + if level > self._min_level: + x_next = inputs[str(level - 1)] + _, layer_in = self.resamples[str(level - 1)]([x_next, x]) + return outputs + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class YoloPAN(tf.keras.layers.Layer): + """YOLO Path Aggregation Network""" + + def __init__(self, + path_process_len=6, + max_level_process_len=None, + embed_spp=False, + use_spatial_attention=False, + csp_stack=False, + activation='leaky', + use_sync_bn=False, + norm_momentum=0.99, + norm_epsilon=0.001, + kernel_initializer='glorot_uniform', + kernel_regularizer=None, + bias_regularizer=None, + fpn_input=True, + fpn_filter_scale=1.0, + **kwargs): + """Yolo Path Aggregation Network initialization function (Yolo V3 and V4). + + Args: + path_process_len: `int`, number of layers ot use in each Decoder path. + max_level_process_len: `int`, number of layers ot use in the largest + processing path, or the backbones largest output if it is different. + embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model. + use_spatial_attention: `bool`, use the spatial attention module. + csp_stack: `bool`, CSPize the FPN. + activation: `str`, the activation function to use typically leaky or mish. + use_sync_bn: if True, use synchronized batch normalization. + norm_momentum: `float`, normalization omentum for the moving average. + norm_epsilon: `float`, small float added to variance to avoid dividing + by zero. + kernel_initializer: kernel_initializer for convolutional layers. + kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d. 
+ fpn_input: `bool`, for whether the input into this fucntion is an FPN or + a backbone. + fpn_filter_scale: `int`, scaling factor for the FPN filters. + **kwargs: keyword arguments to be passed. + """ + + super().__init__(**kwargs) + + self._path_process_len = path_process_len + self._embed_spp = embed_spp + self._use_spatial_attention = use_spatial_attention + + self._activation = activation + self._use_sync_bn = use_sync_bn + self._norm_momentum = norm_momentum + self._norm_epsilon = norm_epsilon + self._kernel_initializer = kernel_initializer + self._kernel_regularizer = kernel_regularizer + self._bias_regularizer = bias_regularizer + self._fpn_input = fpn_input + self._max_level_process_len = max_level_process_len + self._csp_stack = csp_stack + self._fpn_filter_scale = fpn_filter_scale + + if max_level_process_len is None: + self._max_level_process_len = path_process_len + + self._base_config = dict( + activation=self._activation, + use_sync_bn=self._use_sync_bn, + kernel_regularizer=self._kernel_regularizer, + kernel_initializer=self._kernel_initializer, + bias_regularizer=self._bias_regularizer, + norm_epsilon=self._norm_epsilon, + norm_momentum=self._norm_momentum) + + def build(self, inputs): + """Use config dictionary to generate all important attributes for head + construction. + + Args: + inputs: dictionary of the shape of input args as a dictionary of lists. + """ + + # define the key order + keys = [int(key) for key in inputs.keys()] + self._min_level = min(keys) + self._max_level = max(keys) + self._min_depth = inputs[str(self._min_level)][-1] + self._depths = self.get_raw_depths(self._min_depth, inputs) + + # directly connect to an input path and process it + self.preprocessors = dict() + # resample an input and merge it with the output of another path + # inorder to aggregate backbone outputs + self.resamples = dict() + + # FPN will reverse the key process order for the backbone, so we need + # adjust the order that objects are created and processed to adjust for + # this. not using an FPN will directly connect the decoder to the backbone + # therefore the object creation order needs to be done from the largest + # to smallest level. + if self._fpn_input: + # process order {... 
3, 4, 5} + self._iterator = range(self._min_level, self._max_level + 1) + self._check = lambda x: x < self._max_level + self._key_shift = lambda x: x + 1 + self._input = self._min_level + downsample = True + upsample = False + else: + # process order {5, 4, 3, ...} + self._iterator = list( + reversed(range(self._min_level, self._max_level + 1))) + self._check = lambda x: x > self._min_level + self._key_shift = lambda x: x - 1 + self._input = self._max_level + downsample = False + upsample = True + + if self._csp_stack == 0: + proc_filters = lambda x: x + resample_filters = lambda x: x // 2 + else: + proc_filters = lambda x: x * 2 + resample_filters = lambda x: x + for level, depth in zip(self._iterator, self._depths): + if level == self._input: + self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess( + filters=proc_filters(depth), + repetitions=self._max_level_process_len, + insert_spp=self._embed_spp, + block_invert=False, + insert_sam=self._use_spatial_attention, + csp_stack=self._csp_stack, + **self._base_config) + else: + self.resamples[str(level)] = nn_blocks.PathAggregationBlock( + filters=resample_filters(depth), + upsample=upsample, + downsample=downsample, + inverted=False, + drop_final=self._csp_stack == 0, + **self._base_config) + self.preprocessors[str(level)] = nn_blocks.DarkRouteProcess( + filters=proc_filters(depth), + repetitions=self._path_process_len, + insert_spp=False, + insert_sam=self._use_spatial_attention, + csp_stack=self._csp_stack, + **self._base_config) + + def get_raw_depths(self, minimum_depth, inputs): + """Calculates the unscaled depths of the FPN branches. + + Args: + minimum_depth: `int` depth of the smallest branch of the FPN. + inputs: `dict[str, tf.InputSpec]` of the shape of input args as a dictionary of + lists. + + Returns: + The unscaled depths of the FPN branches. + """ + + depths = [] + if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1: + for i in range(self._min_level, self._max_level + 1): + depths.append(inputs[str(i)][-1] * 2) + else: + for _ in range(self._min_level, self._max_level + 1): + depths.append(minimum_depth) + minimum_depth *= 2 + if self._fpn_input: + return depths + return list(reversed(depths)) + + def call(self, inputs): + outputs = dict() + layer_in = inputs[str(self._input)] + + for level in self._iterator: + x_route, x = self.preprocessors[str(level)](layer_in) + outputs[str(level)] = x + if self._check(level): + x_next = inputs[str(self._key_shift(level))] + _, layer_in = self.resamples[str( + self._key_shift(level))]([x_route, x_next]) + return outputs + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class YoloDecoder(tf.keras.Model): + """Darknet Backbone Decoder""" + + def __init__(self, + input_specs, + use_fpn=False, + use_spatial_attention=False, + csp_stack=False, + fpn_depth=4, + fpn_filter_scale=1, + path_process_len=6, + max_level_process_len=None, + embed_spp=False, + activation='leaky', + use_sync_bn=False, + norm_momentum=0.99, + norm_epsilon=0.001, + kernel_initializer='glorot_uniform', + kernel_regularizer=None, + bias_regularizer=None, + **kwargs): + """Yolo Decoder initialization function. A unified model that ties all decoder + components into a conditionally build YOLO decder. + + Args: + input_specs: `dict[str, tf.InputSpec]`: input specs of each of the inputs + to the heads. + use_fpn: `bool`, use the FPN found in the YoloV4 model. + use_spatial_attention: `bool`, use the spatial attention module. + csp_stack: `bool`, CSPize the FPN. 
+ fpn_depth: `int`, number of layers ot use in each FPN path + if you choose to use an FPN. + fpn_filter_scale: `int`, scaling factor for the FPN filters. + path_process_len: `int`, number of layers ot use in each Decoder path. + max_level_process_len: `int`, number of layers ot use in the largest + processing path, or the backbones largest output if it is different. + embed_spp: `bool`, use the SPP found in the YoloV3 and V4 model. + activation: `str`, the activation function to use typically leaky or mish. + use_sync_bn: if True, use synchronized batch normalization. + norm_momentum: `float`, normalization omentum for the moving average. + norm_epsilon: `float`, small float added to variance to avoid dividing by + zero. + kernel_initializer: kernel_initializer for convolutional layers. + kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d. + **kwargs: keyword arguments to be passed. + """ + + self._input_specs = input_specs + self._use_fpn = use_fpn + self._fpn_depth = fpn_depth + self._path_process_len = path_process_len + self._max_level_process_len = max_level_process_len + self._embed_spp = embed_spp + + self._activation = activation + self._use_sync_bn = use_sync_bn + self._norm_momentum = norm_momentum + self._norm_epsilon = norm_epsilon + self._kernel_initializer = kernel_initializer + self._kernel_regularizer = kernel_regularizer + self._bias_regularizer = bias_regularizer + + self._base_config = dict( + use_spatial_attention=use_spatial_attention, + csp_stack=csp_stack, + activation=self._activation, + use_sync_bn=self._use_sync_bn, + fpn_filter_scale=fpn_filter_scale, + norm_momentum=self._norm_momentum, + norm_epsilon=self._norm_epsilon, + kernel_initializer=self._kernel_initializer, + kernel_regularizer=self._kernel_regularizer, + bias_regularizer=self._bias_regularizer) + + self._decoder_config = dict( + path_process_len=self._path_process_len, + max_level_process_len=self._max_level_process_len, + embed_spp=self._embed_spp, + fpn_input=self._use_fpn, + **self._base_config) + + inputs = { + key: tf.keras.layers.Input(shape=value[1:]) + for key, value in input_specs.items() + } + if self._use_fpn: + inter_outs = YoloFPN( + fpn_depth=self._fpn_depth, **self._base_config)( + inputs) + outputs = YoloPAN(**self._decoder_config)(inter_outs) + else: + inter_outs = None + outputs = YoloPAN(**self._decoder_config)(inputs) + + self._output_specs = {key: value.shape for key, value in outputs.items()} + super().__init__(inputs=inputs, outputs=outputs, name='YoloDecoder') + + @property + def use_fpn(self): + return self._use_fpn + + @property + def output_specs(self): + return self._output_specs + + def get_config(self): + config = dict( + input_specs=self._input_specs, + use_fpn=self._use_fpn, + fpn_depth=self._fpn_depth, + **self._decoder_config) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py new file mode 100644 index 00000000000..236396ba79d --- /dev/null +++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder_test.py @@ -0,0 +1,154 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +"""Tests for YOLO.""" + +# Import libraries +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.vision.beta.projects.yolo.modeling.decoders import yolo_decoder as decoders + + +class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase): + + @parameterized.parameters('1', '6spp', '6sppfpn', '6') + def test_network_creation(self, version): + """Test creation of ResNet family models.""" + tf.keras.backend.set_image_data_format('channels_last') + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + decoder = build_yolo_decoder(input_shape, version) + + inputs = {} + for key in input_shape.keys(): + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + endpoints = decoder.call(inputs) + + for key in endpoints.keys(): + self.assertAllEqual(endpoints[key].shape.as_list(), input_shape[key]) + + @combinations.generate( + combinations.combine( + strategy=[ + strategy_combinations.tpu_strategy, + strategy_combinations.one_device_strategy_gpu, + ], + use_sync_bn=[False, True], + )) + def test_sync_bn_multiple_devices(self, strategy, use_sync_bn): + """Test for sync bn on TPU and GPU devices.""" + + tf.keras.backend.set_image_data_format('channels_last') + + with strategy.scope(): + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + decoder = build_yolo_decoder(input_shape, '6') + + inputs = {} + for key in input_shape.keys(): + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + _ = decoder.call(inputs) + + @parameterized.parameters(1, 3, 4) + def test_input_specs(self, input_dim): + """Test different input feature dimensions.""" + tf.keras.backend.set_image_data_format('channels_last') + + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + decoder = build_yolo_decoder(input_shape, '6') + + inputs = {} + for key in input_shape.keys(): + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + _ = decoder(inputs) + + def test_serialize_deserialize(self): + """Create a network object that sets all of its config options.""" + tf.keras.backend.set_image_data_format('channels_last') + + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + decoder = build_yolo_decoder(input_shape, '6') + + inputs = {} + for key in input_shape.keys(): + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + _ = decoder(inputs) + config = decoder.get_config() + decoder_from_config = decoders.YoloDecoder.from_config(config) + self.assertAllEqual(decoder.get_config(), decoder_from_config.get_config()) + + +def build_yolo_decoder(input_specs, type='1'): + if type == '1': + model = decoders.YoloDecoder( + input_specs=input_specs, + embed_spp=False, + use_fpn=False, + max_level_process_len=2, + path_process_len=1, + activation='mish') + elif type == '6spp': + model = decoders.YoloDecoder( + input_specs=input_specs, + 
embed_spp=True, + use_fpn=False, + max_level_process_len=None, + path_process_len=6, + activation='mish') + elif type == '6sppfpn': + model = decoders.YoloDecoder( + input_specs=input_specs, + embed_spp=True, + use_fpn=True, + max_level_process_len=None, + path_process_len=6, + activation='mish') + elif type == '6': + model = decoders.YoloDecoder( + input_specs=input_specs, + embed_spp=False, + use_fpn=False, + max_level_process_len=None, + path_process_len=6, + activation='mish') + else: + raise NotImplementedError(f"YOLO decoder test {type} not implemented.") + return model + + +if __name__ == '__main__': + tf.test.main() diff --git a/official/vision/beta/projects/yolo/modeling/heads/__init__.py b/official/vision/beta/projects/yolo/modeling/heads/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py b/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py new file mode 100644 index 00000000000..4d7d082a00c --- /dev/null +++ b/official/vision/beta/projects/yolo/modeling/heads/yolo_head.py @@ -0,0 +1,120 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Lint as: python3 +import tensorflow as tf +from official.vision.beta.projects.yolo.modeling.layers import nn_blocks + + +class YoloHead(tf.keras.layers.Layer): + """YOLO Prediction Head""" + + def __init__(self, + min_level, + max_level, + classes=80, + boxes_per_level=3, + output_extras=0, + norm_momentum=0.99, + norm_epsilon=0.001, + kernel_initializer='glorot_uniform', + kernel_regularizer=None, + bias_regularizer=None, + activation=None, + **kwargs): + """Yolo Prediction Head initialization function. + + Args: + min_level: `int`, the minimum backbone output level. + max_level: `int`, the maximum backbone output level. + classes: `int`, number of classes per category. + boxes_per_level: `int`, number of boxes to predict per level. + output_extras: `int`, number of additional output channels that the head. + should predict for non-object detection and non-image classification + tasks. + norm_momentum: `float`, normalization momentum for the moving average. + norm_epsilon: `float`, small float added to variance to avoid dividing by + zero. + kernel_initializer: kernel_initializer for convolutional layers. + kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. + bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d. + activation: `str`, the activation function to use typically leaky or mish. + **kwargs: keyword arguments to be passed. 
+ """ + + super().__init__(**kwargs) + self._min_level = min_level + self._max_level = max_level + + self._key_list = [ + str(key) for key in range(self._min_level, self._max_level + 1) + ] + + self._classes = classes + self._boxes_per_level = boxes_per_level + self._output_extras = output_extras + + self._output_conv = (classes + output_extras + 5) * boxes_per_level + + self._base_config = dict( + activation=activation, + norm_momentum=norm_momentum, + norm_epsilon=norm_epsilon, + kernel_initializer=kernel_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer) + + self._conv_config = dict( + filters=self._output_conv, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + use_bn=False, + **self._base_config) + + def build(self, input_shape): + self._head = dict() + for key in self._key_list: + self._head[key] = nn_blocks.ConvBN(**self._conv_config) + + def call(self, inputs): + outputs = dict() + for key in self._key_list: + outputs[key] = self._head[key](inputs[key]) + return outputs + + @property + def output_depth(self): + return (self._classes + self._output_extras + 5) * self._boxes_per_level + + @property + def num_boxes(self): + if self._min_level is None or self._max_level is None: + raise Exception( + 'Model has to be built before number of boxes can be determined.') + return (self._max_level - self._min_level + 1) * self._boxes_per_level + + def get_config(self): + config = dict( + min_level=self._min_level, + max_level=self._max_level, + classes=self._classes, + boxes_per_level=self._boxes_per_level, + output_extras=self._output_extras, + **self._base_config) + return config + + @classmethod + def from_config(cls, config, custom_objects=None): + return cls(**config) diff --git a/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py b/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py new file mode 100644 index 00000000000..422f1a9a8e2 --- /dev/null +++ b/official/vision/beta/projects/yolo/modeling/heads/yolo_head_test.py @@ -0,0 +1,77 @@ +# Copyright 2021 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Lint as: python3 +"""Tests for yolo heads.""" + +# Import libraries +from absl.testing import parameterized +import numpy as np +import tensorflow as tf + +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from official.vision.beta.projects.yolo.modeling.heads import yolo_head as heads + + +class YoloDecoderTest(parameterized.TestCase, tf.test.TestCase): + + def test_network_creation(self): + """Test creation of YOLO family models.""" + tf.keras.backend.set_image_data_format('channels_last') + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + classes = 100 + bps = 3 + head = heads.YoloHead(3, 5, classes=classes, boxes_per_level=bps) + + inputs = {} + for key in input_shape.keys(): + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + endpoints = head(inputs) + # print(endpoints) + + for key in endpoints.keys(): + expected_input_shape = input_shape[key] + expected_input_shape[-1] = (classes + 5) * bps + self.assertAllEqual(endpoints[key].shape.as_list(), expected_input_shape) + + def test_serialize_deserialize(self): + # Create a network object that sets all of its config options. + tf.keras.backend.set_image_data_format('channels_last') + input_shape = { + '3': [1, 52, 52, 256], + '4': [1, 26, 26, 512], + '5': [1, 13, 13, 1024] + } + classes = 100 + bps = 3 + head = heads.YoloHead(3, 5, classes=classes, boxes_per_level=bps) + + inputs = {} + for key in input_shape.keys(): + inputs[key] = tf.ones(input_shape[key], dtype=tf.float32) + + _ = head(inputs) + configs = head.get_config() + head_from_config = heads.YoloHead.from_config(configs) + self.assertAllEqual(head.get_config(), head_from_config.get_config()) + + +if __name__ == '__main__': + tf.test.main() diff --git a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py index 8bc6a78078a..9897def3ad3 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py +++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py @@ -15,62 +15,64 @@ # Lint as: python3 """Contains common building blocks for yolo neural networks.""" - from typing import Callable, List import tensorflow as tf from official.modeling import tf_utils +from official.vision.beta.ops import spatial_transform_ops -@tf.keras.utils.register_keras_serializable(package="yolo") +@tf.keras.utils.register_keras_serializable(package='yolo') class Identity(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + def call(self, inputs): return inputs -@tf.keras.utils.register_keras_serializable(package="yolo") +@tf.keras.utils.register_keras_serializable(package='yolo') class ConvBN(tf.keras.layers.Layer): - """Modified Convolution layer to match that of the DarkNet Library. - + """ + Modified Convolution layer to match that of the Darknet Library. The Layer is a standards combination of Conv BatchNorm Activation, - however, the use of bias in the conv is determined by the use of batch norm. - + however, the use of bias in the conv is determined by the use of batch + normalization. Cross Stage Partial networks (CSPNets) were proposed in: - [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, Ping-Yang - Chen, Jun-Wei Hsieh. - CSPNet: A New Backbone that can Enhance Learning Capability of CNN. 
- arXiv:1911.11929 + [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, + Ping-Yang Chen, Jun-Wei Hsieh + CSPNet: A New Backbone that can Enhance Learning Capability of CNN. + arXiv:1911.11929 """ def __init__(self, filters=1, kernel_size=(1, 1), strides=(1, 1), - padding="same", + padding='same', dilation_rate=(1, 1), - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', bias_regularizer=None, + kernel_regularizer=None, use_bn=True, use_sync_bn=False, norm_momentum=0.99, norm_epsilon=0.001, - activation="leaky", + activation='leaky', leaky_alpha=0.1, **kwargs): - """Initializes ConvBN layer. - + """ Args: - filters: integer for output depth, or the number of features to learn + filters: integer for output depth, or the number of features to learn. kernel_size: integer or tuple for the shape of the weight matrix or kernel to learn. strides: integer of tuple how much to move the kernel after each kernel - use padding: string 'valid' or 'same', if same, then pad the image, else - do not. - padding: `str`, padding method for conv layers. - dilation_rate: tuple to indicate how much to modulate kernel weights and - how many pixels in a feature map to skip. + use. + padding: string 'valid' or 'same', if same, then pad the image, else do + not. + dialtion_rate: tuple to indicate how much to modulate kernel weights and + how many pixels in a feature map to skip. kernel_initializer: string to indicate which function to use to initialize weights. bias_initializer: string to indicate which function to use to initialize @@ -80,14 +82,17 @@ def __init__(self, bias_regularizer: string to indicate which function to use to regularizer bias. use_bn: boolean for whether to use batch normalization. - use_sync_bn: boolean for whether sync batch normalization. - norm_momentum: float for moment to use for batch normalization - norm_epsilon: float for batch normalization epsilon + use_sync_bn: boolean for whether sync batch normalization statistics + of all batch norm layers to the models global statistics + (across all input batches). + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. activation: string or None for activation function to use in layer, - if None activation is replaced by linear. + if None activation is replaced by linear. leaky_alpha: float to use as alpha if activation function is leaky. - **kwargs: Keyword Arguments + **kwargs: Keyword Arguments. 
""" + # convolution params self._filters = filters self._kernel_size = kernel_size @@ -97,15 +102,16 @@ def __init__(self, self._kernel_initializer = kernel_initializer self._bias_initializer = bias_initializer self._kernel_regularizer = kernel_regularizer + self._bias_regularizer = bias_regularizer # batch normalization params self._use_bn = use_bn self._use_sync_bn = use_sync_bn - self._norm_moment = norm_momentum + self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon - if tf.keras.backend.image_data_format() == "channels_last": + if tf.keras.backend.image_data_format() == 'channels_last': # format: (batch_size, height, width, channels) self._bn_axis = -1 else: @@ -116,7 +122,7 @@ def __init__(self, self._activation = activation self._leaky_alpha = leaky_alpha - super(ConvBN, self).__init__(**kwargs) + super().__init__(**kwargs) def build(self, input_shape): use_bias = not self._use_bn @@ -136,101 +142,101 @@ def build(self, input_shape): if self._use_bn: if self._use_sync_bn: self.bn = tf.keras.layers.experimental.SyncBatchNormalization( - momentum=self._norm_moment, + momentum=self._norm_momentum, epsilon=self._norm_epsilon, axis=self._bn_axis) else: self.bn = tf.keras.layers.BatchNormalization( - momentum=self._norm_moment, + momentum=self._norm_momentum, epsilon=self._norm_epsilon, axis=self._bn_axis) - else: - self.bn = Identity() - if self._activation == "leaky": + if self._activation == 'leaky': self._activation_fn = tf.keras.layers.LeakyReLU(alpha=self._leaky_alpha) - elif self._activation == "mish": + elif self._activation == 'mish': self._activation_fn = lambda x: x * tf.math.tanh(tf.math.softplus(x)) else: self._activation_fn = tf_utils.get_activation(self._activation) def call(self, x): x = self.conv(x) - x = self.bn(x) + if self._use_bn: + x = self.bn(x) x = self._activation_fn(x) return x def get_config(self): # used to store/share parameters to reconstruct the model layer_config = { - "filters": self._filters, - "kernel_size": self._kernel_size, - "strides": self._strides, - "padding": self._padding, - "dilation_rate": self._dilation_rate, - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "bias_regularizer": self._bias_regularizer, - "kernel_regularizer": self._kernel_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_moment": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "activation": self._activation, - "leaky_alpha": self._leaky_alpha + 'filters': self._filters, + 'kernel_size': self._kernel_size, + 'strides': self._strides, + 'padding': self._padding, + 'dilation_rate': self._dilation_rate, + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'kernel_regularizer': self._kernel_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._activation, + 'leaky_alpha': self._leaky_alpha } - layer_config.update(super(ConvBN, self).get_config()) + layer_config.update(super().get_config()) return layer_config - def __repr__(self): - return repr(self.get_config()) - -@tf.keras.utils.register_keras_serializable(package="yolo") +@tf.keras.utils.register_keras_serializable(package='yolo') class DarkResidual(tf.keras.layers.Layer): - """DarkNet block with Residual connection for Yolo v3 Backbone. 
+ """ + Darknet block with Residual connection for Yolo v3 Backbone """ def __init__(self, filters=1, filter_scale=2, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", + dilation_rate=1, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', kernel_regularizer=None, bias_regularizer=None, use_bn=True, use_sync_bn=False, norm_momentum=0.99, norm_epsilon=0.001, - activation="leaky", + activation='leaky', leaky_alpha=0.1, - sc_activation="linear", + sc_activation='linear', downsample=False, **kwargs): - """Initializes DarkResidual. - + """ Args: filters: integer for output depth, or the number of features to learn. - filter_scale: `int`, scale factor for number of filters. kernel_initializer: string to indicate which function to use to initialize - weights + weights. bias_initializer: string to indicate which function to use to initialize - bias + bias. kernel_regularizer: string to indicate which function to use to - regularizer weights + regularizer weights. bias_regularizer: string to indicate which function to use to regularizer - bias - use_bn: boolean for whether to use batch normalization - use_sync_bn: boolean for whether sync batch normalization. - norm_momentum: float for moment to use for batch normalization - norm_epsilon: float for batch normalization epsilon - activation: string for activation function to use in conv layers. - leaky_alpha: float to use as alpha if activation function is leaky - sc_activation: string for activation function to use in layer + bias. + use_bn: boolean for whether to use batch normalization. + use_sync_bn: boolean for whether sync batch normalization statistics. + of all batch norm layers to the models global statistics + (across all input batches). + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. + conv_activation: string or None for activation function to use in layer, + if None activation is replaced by linear. + leaky_alpha: float to use as alpha if activation function is leaky. + sc_activation: string for activation function to use in layer. downsample: boolean for if image input is larger than layer output, set - downsample to True so the dimensions are forced to match - **kwargs: Keyword Arguments + downsample to True so the dimensions are forced to match. + **kwargs: Keyword Arguments. 
""" + # downsample self._downsample = downsample @@ -245,8 +251,10 @@ def __init__(self, self._kernel_regularizer = kernel_regularizer # normal params - self._norm_moment = norm_momentum + self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon + self._dilation_rate = dilation_rate if isinstance(dilation_rate, + int) else dilation_rate[0] # activation params self._conv_activation = activation @@ -256,138 +264,150 @@ def __init__(self, super().__init__(**kwargs) def build(self, input_shape): - self._dark_conv_args = { - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "bias_regularizer": self._bias_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_momentum": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "activation": self._conv_activation, - "kernel_regularizer": self._kernel_regularizer, - "leaky_alpha": self._leaky_alpha + dark_conv_args = { + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._conv_activation, + 'kernel_regularizer': self._kernel_regularizer, + 'leaky_alpha': self._leaky_alpha } if self._downsample: + if self._dilation_rate > 1: + dilation_rate = 1 + if self._dilation_rate // 2 > 0: + dilation_rate = self._dilation_rate // 2 + down_stride = 1 + else: + dilation_rate = 1 + down_stride = 2 + self._dconv = ConvBN( filters=self._filters, kernel_size=(3, 3), - strides=(2, 2), - padding="same", - **self._dark_conv_args) - else: - self._dconv = Identity() + strides=down_stride, + dilation_rate=dilation_rate, + padding='same', + **dark_conv_args) self._conv1 = ConvBN( filters=self._filters // self._filter_scale, kernel_size=(1, 1), strides=(1, 1), - padding="same", - **self._dark_conv_args) + padding='same', + **dark_conv_args) self._conv2 = ConvBN( filters=self._filters, kernel_size=(3, 3), strides=(1, 1), - padding="same", - **self._dark_conv_args) + dilation_rate=self._dilation_rate, + padding='same', + **dark_conv_args) self._shortcut = tf.keras.layers.Add() - if self._sc_activation == "leaky": - self._activation_fn = tf.keras.layers.LeakyReLU( - alpha=self._leaky_alpha) - elif self._sc_activation == "mish": + if self._sc_activation == 'leaky': + self._activation_fn = tf.keras.layers.LeakyReLU(alpha=self._leaky_alpha) + elif self._sc_activation == 'mish': self._activation_fn = lambda x: x * tf.math.tanh(tf.math.softplus(x)) else: - self._activation_fn = tf_utils.get_activation(self._sc_activation) + self._activation_fn = tf_utils.get_activation( + self._sc_activation + ) super().build(input_shape) - def call(self, inputs): - shortcut = self._dconv(inputs) - x = self._conv1(shortcut) + def call(self, inputs, training=None): + if self._downsample: + inputs = self._dconv(inputs) + x = self._conv1(inputs) x = self._conv2(x) - x = self._shortcut([x, shortcut]) + x = self._shortcut([x, inputs]) return self._activation_fn(x) def get_config(self): # used to store/share parameters to reconstruct the model layer_config = { - "filters": self._filters, - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "kernel_regularizer": self._kernel_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_moment": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - 
"activation": self._conv_activation, - "leaky_alpha": self._leaky_alpha, - "sc_activation": self._sc_activation, - "downsample": self._downsample + 'filters': self._filters, + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'kernel_regularizer': self._kernel_regularizer, + 'dilation_rate': self._dilation_rate, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._conv_activation, + 'leaky_alpha': self._leaky_alpha, + 'sc_activation': self._sc_activation, + 'downsample': self._downsample, } layer_config.update(super().get_config()) return layer_config -@tf.keras.utils.register_keras_serializable(package="yolo") +@tf.keras.utils.register_keras_serializable(package='yolo') class CSPTiny(tf.keras.layers.Layer): - """A Small size convolution block proposed in the CSPNet. - - The layer uses shortcuts, routing(concatnation), and feature grouping - in order to improve gradient variablity and allow for high efficency, low - power residual learning for small networtf.keras. - + """ + A Small size convolution block proposed in the CSPNet. The layer uses + shortcuts, routing(concatnation), and feature grouping in order to improve + gradient variablity and allow for high efficency, low power residual learning + for small networtf.keras. Cross Stage Partial networks (CSPNets) were proposed in: - [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, Ping-Yang - Chen, Jun-Wei Hsieh + [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, + Ping-Yang Chen, Jun-Wei Hsieh CSPNet: A New Backbone that can Enhance Learning Capability of CNN. - arXiv:1911.11929 + arXiv:1911.11929 """ def __init__(self, filters=1, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', bias_regularizer=None, + kernel_regularizer=None, use_bn=True, + dilation_rate=1, use_sync_bn=False, group_id=1, groups=2, norm_momentum=0.99, norm_epsilon=0.001, - activation="leaky", + activation='leaky', downsample=True, leaky_alpha=0.1, **kwargs): - """Initializes CSPTiny. - + """ Args: - filters: integer for output depth, or the number of features to learn + filters: integer for output depth, or the number of features to learn. kernel_initializer: string to indicate which function to use to initialize - weights + weights. bias_initializer: string to indicate which function to use to initialize - bias + bias. + use_bn: boolean for whether to use batch normalization. kernel_regularizer: string to indicate which function to use to - regularizer weights + regularizer weights. bias_regularizer: string to indicate which function to use to regularizer - bias - use_bn: boolean for whether to use batch normalization - use_sync_bn: boolean for whether sync batch normalization statistics of - all batch norm layers to the models global statistics (across all input - batches) - group_id: integer for which group of features to pass through the csp tiny - stack. + bias. + use_sync_bn: boolean for whether sync batch normalization statistics + of all batch norm layers to the models global statistics + (across all input batches). + group_id: integer for which group of features to pass through the csp + tiny stack. 
groups: integer for how many splits there should be in the convolution - feature stack output - norm_momentum: float for moment to use for batch normalization - norm_epsilon: float for batch normalization epsilon - activation: string or None for activation function to use in layer, - if None activation is replaced by linear + feature stack output. + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. + conv_activation: string or None for activation function to use in layer, + if None activation is replaced by linear. + leaky_alpha: float to use as alpha if activation function is leaky. + sc_activation: string for activation function to use in layer. downsample: boolean for if image input is larger than layer output, set - downsample to True so the dimensions are forced to match - leaky_alpha: float to use as alpha if activation function is leaky - **kwargs: Keyword Arguments + downsample to True so the dimensions are forced to match. + **kwargs: Keyword Arguments. """ # ConvBN params @@ -396,6 +416,7 @@ def __init__(self, self._bias_initializer = bias_initializer self._bias_regularizer = bias_regularizer self._use_bn = use_bn + self._dilation_rate = dilation_rate self._use_sync_bn = use_sync_bn self._kernel_regularizer = kernel_regularizer self._groups = groups @@ -403,7 +424,7 @@ def __init__(self, self._downsample = downsample # normal params - self._norm_moment = norm_momentum + self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon # activation params @@ -413,37 +434,37 @@ def __init__(self, super().__init__(**kwargs) def build(self, input_shape): - self._dark_conv_args = { - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "bias_regularizer": self._bias_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_momentum": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "activation": self._conv_activation, - "kernel_regularizer": self._kernel_regularizer, - "leaky_alpha": self._leaky_alpha + dark_conv_args = { + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._conv_activation, + 'kernel_regularizer': self._kernel_regularizer, + 'leaky_alpha': self._leaky_alpha } self._convlayer1 = ConvBN( filters=self._filters, kernel_size=(3, 3), strides=(1, 1), - padding="same", - **self._dark_conv_args) + padding='same', + **dark_conv_args) self._convlayer2 = ConvBN( filters=self._filters // 2, kernel_size=(3, 3), strides=(1, 1), - padding="same", + padding='same', kernel_initializer=self._kernel_initializer, bias_initializer=self._bias_initializer, bias_regularizer=self._bias_regularizer, kernel_regularizer=self._kernel_regularizer, use_bn=self._use_bn, use_sync_bn=self._use_sync_bn, - norm_momentum=self._norm_moment, + norm_momentum=self._norm_momentum, norm_epsilon=self._norm_epsilon, activation=self._conv_activation, leaky_alpha=self._leaky_alpha) @@ -452,22 +473,23 @@ def build(self, input_shape): filters=self._filters // 2, kernel_size=(3, 3), strides=(1, 1), - padding="same", - **self._dark_conv_args) + padding='same', + **dark_conv_args) self._convlayer4 = ConvBN( filters=self._filters, kernel_size=(1, 1), strides=(1, 1), - padding="same", - **self._dark_conv_args) + padding='same', 
+ **dark_conv_args) - self._maxpool = tf.keras.layers.MaxPool2D( - pool_size=2, strides=2, padding="same", data_format=None) + if self._downsample: + self._maxpool = tf.keras.layers.MaxPool2D( + pool_size=2, strides=2, padding='same', data_format=None) super().build(input_shape) - def call(self, inputs): + def call(self, inputs, training=None): x1 = self._convlayer1(inputs) x1_group = tf.split(x1, self._groups, axis=-1)[self._group_id] x2 = self._convlayer2(x1_group) # grouping @@ -479,74 +501,55 @@ def call(self, inputs): x = self._maxpool(x) return x, x5 - def get_config(self): - # used to store/share parameters to reconsturct the model - layer_config = { - "filters": self._filters, - "strides": self._strides, - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "kernel_regularizer": self._kernel_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_moment": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "activation": self._conv_activation, - "leaky_alpha": self._leaky_alpha, - "sc_activation": self._sc_activation, - } - layer_config.update(super().get_config()) - return layer_config - -@tf.keras.utils.register_keras_serializable(package="yolo") +@tf.keras.utils.register_keras_serializable(package='yolo') class CSPRoute(tf.keras.layers.Layer): - """Down sampling layer to take the place of down sampleing. - - It is applied in Residual networks. This is the first of 2 layers needed to - convert any Residual Network model to a CSPNet. At the start of a new level - change, this CSPRoute layer creates a learned identity that will act as a - cross stage connection, that is used to inform the inputs to the next stage. - It is called cross stage partial because the number of filters required in - every intermitent Residual layer is reduced by half. The sister layer will - take the partial generated by this layer and concatnate it with the output of - the final residual layer in the stack to create a fully feature level output. - This concatnation merges the partial blocks of 2 levels as input to the next - allowing the gradients of each level to be more unique, and reducing the - number of parameters required by each level by 50% while keeping accuracy - consistent. + """ + Down sampling layer to take the place of down sampleing done in Residual + networks. This is the first of 2 layers needed to convert any Residual Network + model to a CSPNet. At the start of a new level change, this CSPRoute layer + creates a learned identity that will act as a cross stage connection, + that is used to inform the inputs to the next stage. It is called cross stage + partial because the number of filters required in every intermitent Residual + layer is reduced by half. The sister layer will take the partial generated by + this layer and concatnate it with the output of the final residual layer in + the stack to create a fully feature level output. This concatnation merges the + partial blocks of 2 levels as input to the next allowing the gradients of each + level to be more unique, and reducing the number of parameters required by + each level by 50% while keeping accuracy consistent. Cross Stage Partial networks (CSPNets) were proposed in: - [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, Ping-Yang - Chen, Jun-Wei Hsieh. + [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, + Ping-Yang Chen, Jun-Wei Hsieh CSPNet: A New Backbone that can Enhance Learning Capability of CNN. 
- arXiv:1911.11929 + arXiv:1911.11929 """ def __init__(self, filters, filter_scale=2, - activation="mish", - downsample=True, - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, + activation='mish', + kernel_initializer='glorot_uniform', + bias_initializer='zeros', bias_regularizer=None, + kernel_regularizer=None, + dilation_rate=1, use_bn=True, use_sync_bn=False, norm_momentum=0.99, norm_epsilon=0.001, + downsample=True, + leaky_alpha=0.1, **kwargs): - """Initializes CSPRoute. - + """ Args: filters: integer for output depth, or the number of features to learn filter_scale: integer dicating (filters//2) or the number of filters in the partial feature stack. - activation: string for activation function to use in layer downsample: down_sample the input. - kernel_initializer: string to indicate which function to use to initialize - weights. + activation: string for activation function to use in layer. + kernel_initializer: string to indicate which function to use to + initialize weights. bias_initializer: string to indicate which function to use to initialize bias. kernel_regularizer: string to indicate which function to use to @@ -554,99 +557,117 @@ def __init__(self, bias_regularizer: string to indicate which function to use to regularizer bias. use_bn: boolean for whether to use batch normalization. - use_sync_bn: boolean for whether sync batch normalization. - norm_momentum: float for moment to use for batch normalization - norm_epsilon: float for batch normalization epsilon - **kwargs: Keyword Arguments + use_sync_bn: boolean for whether sync batch normalization statistics + of all batch norm layers to the models global statistics + (across all input batches). + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. + **kwargs: Keyword Arguments. """ super().__init__(**kwargs) - # Layer params. + # layer params self._filters = filters self._filter_scale = filter_scale self._activation = activation - # Convoultion params. 
+ # convoultion params self._kernel_initializer = kernel_initializer self._bias_initializer = bias_initializer self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer + self._dilation_rate = dilation_rate self._use_bn = use_bn self._use_sync_bn = use_sync_bn - self._norm_moment = norm_momentum + self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon self._downsample = downsample + self._leaky_alpha = leaky_alpha def build(self, input_shape): - self._dark_conv_args = { - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "bias_regularizer": self._bias_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_momentum": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "activation": self._activation, - "kernel_regularizer": self._kernel_regularizer, + dark_conv_args = { + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._activation, + 'kernel_regularizer': self._kernel_regularizer, + 'leaky_alpha': self._leaky_alpha, } if self._downsample: - self._conv1 = ConvBN(filters=self._filters, - kernel_size=(3, 3), - strides=(2, 2), - **self._dark_conv_args) - else: - self._conv1 = ConvBN(filters=self._filters, - kernel_size=(3, 3), - strides=(1, 1), - **self._dark_conv_args) - self._conv2 = ConvBN(filters=self._filters // self._filter_scale, - kernel_size=(1, 1), - strides=(1, 1), - **self._dark_conv_args) - - self._conv3 = ConvBN(filters=self._filters // self._filter_scale, - kernel_size=(1, 1), - strides=(1, 1), - **self._dark_conv_args) + if self._dilation_rate > 1: + dilation_rate = 1 + if self._dilation_rate // 2 > 0: + dilation_rate = self._dilation_rate // 2 + down_stride = 1 + else: + dilation_rate = 1 + down_stride = 2 - def call(self, inputs): - x = self._conv1(inputs) - y = self._conv2(x) - x = self._conv3(x) - return (x, y) + self._conv1 = ConvBN( + filters=self._filters, + kernel_size=(3, 3), + strides=down_stride, + dilation_rate=dilation_rate, + **dark_conv_args) + self._conv2 = ConvBN( + filters=self._filters // self._filter_scale, + kernel_size=(1, 1), + strides=(1, 1), + **dark_conv_args) -@tf.keras.utils.register_keras_serializable(package="yolo") -class CSPConnect(tf.keras.layers.Layer): - """Sister Layer to the CSPRoute layer. + self._conv3 = ConvBN( + filters=self._filters // self._filter_scale, + kernel_size=(1, 1), + strides=(1, 1), + **dark_conv_args) + + def call(self, inputs, training=None): + if self._downsample: + inputs = self._conv1(inputs) + y = self._conv2(inputs) + x = self._conv3(inputs) + return (x, y) - Merges the partial feature stacks generated by the CSPDownsampling layer, - and the finaly output of the residual stack. Suggested in the CSPNet paper. +@tf.keras.utils.register_keras_serializable(package='yolo') +class CSPConnect(tf.keras.layers.Layer): + """ + Sister Layer to the CSPRoute layer. Merges the partial feature stacks + generated by the CSPDownsampling layer, and the finaly output of the + residual stack. Suggested in the CSPNet paper. Cross Stage Partial networks (CSPNets) were proposed in: - [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, Ping-Yang - Chen, Jun-Wei Hsieh. 
+ [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, + Ping-Yang Chen, Jun-Wei Hsieh CSPNet: A New Backbone that can Enhance Learning Capability of CNN. - arXiv:1911.11929 + arXiv:1911.11929 """ def __init__(self, filters, filter_scale=2, - activation="mish", - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, + drop_final=False, + drop_first=False, + activation='mish', + kernel_size=(1, 1), + kernel_initializer='glorot_uniform', + bias_initializer='zeros', bias_regularizer=None, + kernel_regularizer=None, + dilation_rate=1, use_bn=True, use_sync_bn=False, norm_momentum=0.99, norm_epsilon=0.001, + leaky_alpha=0.1, **kwargs): - """Initializes CSPConnect. - + """ Args: - filters: integer for output depth, or the number of features to learn. + filters: integer for output depth, or the number of features to learn filter_scale: integer dicating (filters//2) or the number of filters in the partial feature stack. activation: string for activation function to use in layer. @@ -659,96 +680,112 @@ def __init__(self, bias_regularizer: string to indicate which function to use to regularizer bias. use_bn: boolean for whether to use batch normalization. - use_sync_bn: boolean for whether sync batch normalization. - norm_momentum: float for moment to use for batch normalization - norm_epsilon: float for batch normalization epsilon - **kwargs: Keyword Arguments + use_sync_bn: boolean for whether sync batch normalization statistics + of all batch norm layers to the models global + statistics (across all input batches). + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. + **kwargs: Keyword Arguments. """ + super().__init__(**kwargs) - # layer params. + # layer params self._filters = filters self._filter_scale = filter_scale self._activation = activation - # Convoultion params. 
+ # convoultion params + self._kernel_size = kernel_size self._kernel_initializer = kernel_initializer self._bias_initializer = bias_initializer self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer self._use_bn = use_bn self._use_sync_bn = use_sync_bn - self._norm_moment = norm_momentum + self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon + self._drop_final = drop_final + self._drop_first = drop_first + self._leaky_alpha = leaky_alpha def build(self, input_shape): - self._dark_conv_args = { - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "bias_regularizer": self._bias_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_momentum": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "activation": self._activation, - "kernel_regularizer": self._kernel_regularizer, + dark_conv_args = { + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._activation, + 'kernel_regularizer': self._kernel_regularizer, + 'leaky_alpha': self._leaky_alpha, } - self._conv1 = ConvBN(filters=self._filters // self._filter_scale, - kernel_size=(1, 1), - strides=(1, 1), - **self._dark_conv_args) + if not self._drop_first: + self._conv1 = ConvBN( + filters=self._filters // self._filter_scale, + kernel_size=self._kernel_size, + strides=(1, 1), + **dark_conv_args) self._concat = tf.keras.layers.Concatenate(axis=-1) - self._conv2 = ConvBN(filters=self._filters, - kernel_size=(1, 1), - strides=(1, 1), - **self._dark_conv_args) - def call(self, inputs): + if not self._drop_final: + self._conv2 = ConvBN( + filters=self._filters, + kernel_size=(1, 1), + strides=(1, 1), + **dark_conv_args) + + def call(self, inputs, training=None): x_prev, x_csp = inputs - x = self._conv1(x_prev) - x = self._concat([x, x_csp]) - x = self._conv2(x) + if not self._drop_first: + x_prev = self._conv1(x_prev) + x = self._concat([x_prev, x_csp]) + + # skipped if drop final is true + if not self._drop_final: + x = self._conv2(x) return x class CSPStack(tf.keras.layers.Layer): - """CSP full stack. - - Combines the route and the connect in case you dont want to just quickly wrap - an existing callable or list of layers to make it a cross stage partial. - Added for ease of use. you should be able to wrap any layer stack with a CSP - independent of wether it belongs to the Darknet family. if filter_scale = 2, - then the blocks in the stack passed into the the CSP stack should also have - filters = filters/filter_scale. - + """ + CSP full stack, combines the route and the connect in case you dont want to + jsut quickly wrap an existing callable or list of layers to + make it a cross stage partial. Added for ease of use. you should be able + to wrap any layer stack with a CSP independent of wether it belongs + to the Darknet family. 
if filter_scale = 2, then the blocks in the stack + passed into the the CSP stack should also have filters = filters/filter_scale Cross Stage Partial networks (CSPNets) were proposed in: - [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, Ping-Yang - Chen, Jun-Wei Hsieh + + [1] Chien-Yao Wang, Hong-Yuan Mark Liao, I-Hau Yeh, Yueh-Hua Wu, + Ping-Yang Chen, Jun-Wei Hsieh CSPNet: A New Backbone that can Enhance Learning Capability of CNN. - arXiv:1911.11929 + arXiv:1911.11929 """ def __init__(self, filters, model_to_wrap=None, filter_scale=2, - activation="mish", - kernel_initializer="glorot_uniform", - bias_initializer="zeros", - kernel_regularizer=None, + activation='mish', + kernel_initializer='glorot_uniform', + bias_initializer='zeros', bias_regularizer=None, + kernel_regularizer=None, downsample=True, use_bn=True, use_sync_bn=False, norm_momentum=0.99, norm_epsilon=0.001, **kwargs): - """Initializes CSPStack. - + """ Args: - filters: integer for output depth, or the number of features to learn. model_to_wrap: callable Model or a list of callable objects that will - process the output of CSPRoute, and be input into CSPConnect. List will - be called sequentially. + process the output of CSPRoute, and be input into CSPConnect. + list will be called sequentially. + downsample: down_sample the input. + filters: integer for output depth, or the number of features to learn. filter_scale: integer dicating (filters//2) or the number of filters in the partial feature stack. activation: string for activation function to use in layer. @@ -760,62 +797,793 @@ def __init__(self, regularizer weights. bias_regularizer: string to indicate which function to use to regularizer bias. - downsample: down_sample the input. - use_bn: boolean for whether to use batch normalization - use_sync_bn: boolean for whether sync batch normalization. - norm_momentum: float for moment to use for batch normalization - norm_epsilon: float for batch normalization epsilon - **kwargs: Keyword Arguments + use_bn: boolean for whether to use batch normalization. + use_sync_bn: boolean for whether sync batch normalization statistics + of all batch norm layers to the models global statistics + (across all input batches). + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. + **kwargs: Keyword Arguments. + + Raises: + TypeError: model_to_wrap is not a layer or a list of layers """ + super().__init__(**kwargs) - # Layer params. + # layer params self._filters = filters self._filter_scale = filter_scale self._activation = activation self._downsample = downsample - # Convoultion params. 
+ # convoultion params self._kernel_initializer = kernel_initializer self._bias_initializer = bias_initializer self._kernel_regularizer = kernel_regularizer self._bias_regularizer = bias_regularizer self._use_bn = use_bn self._use_sync_bn = use_sync_bn - self._norm_moment = norm_momentum + self._norm_momentum = norm_momentum self._norm_epsilon = norm_epsilon - if model_to_wrap is not None: - if isinstance(model_to_wrap, Callable): - self._model_to_wrap = [model_to_wrap] - elif isinstance(model_to_wrap, List): - self._model_to_wrap = model_to_wrap - else: - raise ValueError("The input to the CSPStack must be a list of layers" - "that we can iterate through, or \n a callable") - else: + if model_to_wrap is None: self._model_to_wrap = [] + elif isinstance(model_to_wrap, Callable): + self._model_to_wrap = [model_to_wrap] + elif isinstance(model_to_wrap, List): + self._model_to_wrap = model_to_wrap + else: + raise TypeError( + 'the input to the CSPStack must be a list of layers that we can' + + 'iterate through, or \n a callable') def build(self, input_shape): - self._dark_conv_args = { - "filters": self._filters, - "filter_scale": self._filter_scale, - "activation": self._activation, - "kernel_initializer": self._kernel_initializer, - "bias_initializer": self._bias_initializer, - "bias_regularizer": self._bias_regularizer, - "use_bn": self._use_bn, - "use_sync_bn": self._use_sync_bn, - "norm_momentum": self._norm_moment, - "norm_epsilon": self._norm_epsilon, - "kernel_regularizer": self._kernel_regularizer, + dark_conv_args = { + 'filters': self._filters, + 'filter_scale': self._filter_scale, + 'activation': self._activation, + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'kernel_regularizer': self._kernel_regularizer, } - self._route = CSPRoute(downsample=self._downsample, **self._dark_conv_args) - self._connect = CSPConnect(**self._dark_conv_args) - return + self._route = CSPRoute(downsample=self._downsample, **dark_conv_args) + self._connect = CSPConnect(**dark_conv_args) - def call(self, inputs): + def call(self, inputs, training=None): x, x_route = self._route(inputs) for layer in self._model_to_wrap: x = layer(x) x = self._connect([x, x_route]) return x + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class PathAggregationBlock(tf.keras.layers.Layer): + + def __init__(self, + filters=1, + drop_final=True, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + bias_regularizer=None, + kernel_regularizer=None, + use_bn=True, + use_sync_bn=False, + inverted=False, + norm_momentum=0.99, + norm_epsilon=0.001, + activation='leaky', + leaky_alpha=0.1, + downsample=False, + upsample=False, + upsample_size=2, + **kwargs): + """ + Args: + filters: integer for output depth, or the number of features to learn. + drop_final: do not create the last convolution block. + kernel_initializer: string to indicate which function to use to initialize + weights. + bias_initializer: string to indicate which function to use to initialize + bias. + kernel_regularizer: string to indicate which function to use to + regularizer weights. + bias_regularizer: string to indicate which function to use to regularizer + bias. + use_bn: boolean for whether to use batch normalization. 
+ use_sync_bn: boolean for whether sync batch normalization statistics + of all batch norm layers to the models global statistics + (across all input batches). + inverted: boolean for inverting the order of the convolutions. + norm_momentum: float for moment to use for batch normalization. + norm_epsilon: float for batch normalization epsilon. + activation: string or None for activation function to use in layer, + if None activation is replaced by linear. + leaky_alpha: float to use as alpha if activation function is leaky. + downsample: `bool` for whehter to downwample and merge. + upsample: `bool` for whehter to upsample and merge. + upsample_size: `int` how much to upsample in order to match shapes. + **kwargs: Keyword Arguments. + """ + + # darkconv params + self._filters = filters + self._kernel_initializer = kernel_initializer + self._bias_initializer = bias_initializer + self._bias_regularizer = bias_regularizer + self._kernel_regularizer = kernel_regularizer + self._use_bn = use_bn + self._use_sync_bn = use_sync_bn + + # normal params + self._norm_momentum = norm_momentum + self._norm_epsilon = norm_epsilon + + # activation params + self._conv_activation = activation + self._leaky_alpha = leaky_alpha + self._downsample = downsample + self._upsample = upsample + self._upsample_size = upsample_size + self._drop_final = drop_final + + #block params + self._inverted = inverted + + super().__init__(**kwargs) + + def _build_regular(self, input_shape, kwargs): + if self._downsample: + self._conv = ConvBN( + filters=self._filters, + kernel_size=(3, 3), + strides=(2, 2), + padding='same', + **kwargs) + else: + self._conv = ConvBN( + filters=self._filters, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + **kwargs) + + if not self._drop_final: + self._conv_concat = ConvBN( + filters=self._filters, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + **kwargs) + + def _build_reversed(self, input_shape, kwargs): + if self._downsample: + self._conv_prev = ConvBN( + filters=self._filters, + kernel_size=(3, 3), + strides=(2, 2), + padding='same', + **kwargs) + else: + self._conv_prev = ConvBN( + filters=self._filters, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + **kwargs) + + self._conv_route = ConvBN( + filters=self._filters, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + **kwargs) + + if not self._drop_final: + self._conv_sync = ConvBN( + filters=self._filters, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + **kwargs) + + def build(self, input_shape): + dark_conv_args = { + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_bn': self._use_bn, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'activation': self._conv_activation, + 'kernel_regularizer': self._kernel_regularizer, + 'leaky_alpha': self._leaky_alpha, + } + + if self._inverted: + self._build_reversed(input_shape, dark_conv_args) + else: + self._build_regular(input_shape, dark_conv_args) + + self._concat = tf.keras.layers.Concatenate() + super().build(input_shape) + + def _call_regular(self, inputs, training=None): + input_to_convolve, input_to_concat = inputs + x_prev = self._conv(input_to_convolve) + if self._upsample: + x_prev = spatial_transform_ops.nearest_upsampling(x_prev, + self._upsample_size) + x = self._concat([x_prev, input_to_concat]) + + # used in csp conversion + if not self._drop_final: + x = 
self._conv_concat(x) + return x_prev, x + + def _call_reversed(self, inputs, training=None): + x_route, x_prev = inputs + x_prev = self._conv_prev(x_prev) + if self._upsample: + x_prev = spatial_transform_ops.nearest_upsampling(x_prev, + self._upsample_size) + x_route = self._conv_route(x_route) + x = self._concat([x_route, x_prev]) + if not self._drop_final: + x = self._conv_sync(x) + return x_prev, x + + def call(self, inputs, training=None): + # done this way to prevent confusion in the auto graph + if self._inverted: + return self._call_reversed(inputs, training=training) + else: + return self._call_regular(inputs, training=training) + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class SPP(tf.keras.layers.Layer): + """ + a non-agregated SPP layer that uses Pooling to gain more performance + """ + + def __init__(self, sizes, **kwargs): + self._sizes = list(reversed(sizes)) + if len(sizes) == 0: + raise ValueError('More than one maxpool should be specified in SSP block') + super().__init__(**kwargs) + + def build(self, input_shape): + maxpools = [] + for size in self._sizes: + maxpools.append( + tf.keras.layers.MaxPool2D( + pool_size=(size, size), + strides=(1, 1), + padding='same', + data_format=None)) + self._maxpools = maxpools + super().build(input_shape) + + def call(self, inputs, training=None): + outputs = [] + for maxpool in self._maxpools: + outputs.append(maxpool(inputs)) + outputs.append(inputs) + concat_output = tf.keras.layers.concatenate(outputs) + return concat_output + + def get_config(self): + layer_config = {'sizes': self._sizes} + layer_config.update(super().get_config()) + return layer_config + + +class SAM(tf.keras.layers.Layer): + """ + [1] Sanghyun Woo, Jongchan Park, Joon-Young Lee, In So Kweon + CBAM: Convolutional Block Attention Module. 
arXiv:1807.06521 + + implementation of the Spatial Attention Model (SAM) + """ + + def __init__(self, + use_pooling=False, + filter_match=False, + filters=1, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + dilation_rate=(1, 1), + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + bias_regularizer=None, + kernel_regularizer=None, + use_bn=True, + use_sync_bn=True, + norm_momentum=0.99, + norm_epsilon=0.001, + activation='sigmoid', + output_activation=None, + leaky_alpha=0.1, + **kwargs): + + # use_pooling + self._use_pooling = use_pooling + self._filters = filters + self._output_activation = output_activation + self._leaky_alpha = leaky_alpha + + self.dark_conv_args = { + 'kernel_size': kernel_size, + 'strides': strides, + 'padding': padding, + 'dilation_rate': dilation_rate, + 'kernel_initializer': kernel_initializer, + 'bias_initializer': bias_initializer, + 'bias_regularizer': bias_regularizer, + 'use_bn': use_bn, + 'use_sync_bn': use_sync_bn, + 'norm_momentum': norm_momentum, + 'norm_epsilon': norm_epsilon, + 'activation': activation, + 'kernel_regularizer': kernel_regularizer, + 'leaky_alpha': leaky_alpha + } + + super().__init__(**kwargs) + + def build(self, input_shape): + if self._filters == -1: + self._filters = input_shape[-1] + self._conv = ConvBN(filters=self._filters, **self.dark_conv_args) + if self._output_activation == 'leaky': + self._activation_fn = tf.keras.layers.LeakyReLU(alpha=self._leaky_alpha) + elif self._output_activation == 'mish': + self._activation_fn = lambda x: x * tf.math.tanh(tf.math.softplus(x)) + else: + self._activation_fn = tf_utils.get_activation(self._output_activation) + + def call(self, inputs, training=None): + if self._use_pooling: + depth_max = tf.reduce_max(inputs, axis=-1, keep_dims=True) + depth_avg = tf.reduce_mean(inputs, axis=-1, keep_dims=True) + input_maps = tf.concat([depth_avg, depth_max], axis=-1) + else: + input_maps = inputs + + attention_mask = self._conv(input_maps) + return self._activation_fn(inputs * attention_mask) + + +class CAM(tf.keras.layers.Layer): + """ + [1] Sanghyun Woo, Jongchan Park, Joon-Young Lee, In So Kweon + CBAM: Convolutional Block Attention Module. 
arXiv:1807.06521 + + implementation of the Channel Attention Model (CAM) + """ + + def __init__(self, + reduction_ratio=1.0, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + bias_regularizer=None, + kernel_regularizer=None, + use_bn=False, + use_sync_bn=False, + use_bias=False, + norm_momentum=0.99, + norm_epsilon=0.001, + mlp_activation='linear', + activation='sigmoid', + leaky_alpha=0.1, + **kwargs): + + self._reduction_ratio = reduction_ratio + + # use_pooling + if use_sync_bn: + self._bn = tf.keras.layers.experimental.SyncBatchNormalization + else: + self._bn = tf.keras.layers.BatchNormalization + + if not use_bn: + self._bn = Identity + self._bn_args = {} + else: + self._bn_args = { + 'momentum': norm_momentum, + 'epsilon': norm_epsilon, + } + + self._mlp_args = { + 'use_bias': use_bias, + 'kernel_initializer': kernel_initializer, + 'bias_initializer': bias_initializer, + 'bias_regularizer': bias_regularizer, + 'activation': mlp_activation, + 'kernel_regularizer': kernel_regularizer, + } + + self._leaky_alpha = leaky_alpha + self._activation = activation + + super().__init__(**kwargs) + + def build(self, input_shape): + self._filters = input_shape[-1] + + self._mlp = tf.keras.Sequential([ + tf.keras.layers.Dense(self._filters, **self._mlp_args), + self._bn(**self._bn_args), + tf.keras.layers.Dense( + int(self._filters * self._reduction_ratio), **self._mlp_args), + self._bn(**self._bn_args), + tf.keras.layers.Dense(self._filters, **self._mlp_args), + self._bn(**self._bn_args), + ]) + + if self._activation == 'leaky': + self._activation_fn = tf.keras.layers.LeakyReLU(alpha=self._leaky_alpha) + elif self._activation == 'mish': + self._activation_fn = lambda x: x * tf.math.tanh(tf.math.softplus(x)) + else: + self._activation_fn = tf_utils.get_activation(self._activation) + + def call(self, inputs, training=None): + depth_max = self._mlp(tf.reduce_max(inputs, axis=(1, 2))) + depth_avg = self._mlp(tf.reduce_mean(inputs, axis=(1, 2))) + channel_mask = self._activation_fn(depth_avg + depth_max) + + channel_mask = tf.expand_dims(channel_mask, axis=1) + attention_mask = tf.expand_dims(channel_mask, axis=1) + + return inputs * attention_mask + + +class CBAM(tf.keras.layers.Layer): + """ + [1] Sanghyun Woo, Jongchan Park, Joon-Young Lee, In So Kweon + CBAM: Convolutional Block Attention Module. 
arXiv:1807.06521 + + implementation of the Convolution Block Attention Module (CBAM) + """ + + def __init__(self, + use_pooling=False, + filters=1, + reduction_ratio=1.0, + kernel_size=(1, 1), + strides=(1, 1), + padding='same', + dilation_rate=(1, 1), + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + bias_regularizer=None, + kernel_regularizer=None, + use_bn=True, + use_sync_bn=False, + norm_momentum=0.99, + norm_epsilon=0.001, + mlp_activation=None, + activation='sigmoid', + leaky_alpha=0.1, + **kwargs): + + # use_pooling + + self._sam_args = { + 'use_pooling': use_pooling, + 'filters': filters, + 'kernel_size': kernel_size, + 'strides': strides, + 'padding': padding, + 'dilation_rate': dilation_rate, + } + + self._cam_args = { + 'reduction_ratio': reduction_ratio, + 'mlp_activation': mlp_activation + } + + self._common_args = { + 'kernel_initializer': kernel_initializer, + 'bias_initializer': bias_initializer, + 'bias_regularizer': bias_regularizer, + 'use_bn': use_bn, + 'use_sync_bn': use_sync_bn, + 'norm_momentum': norm_momentum, + 'norm_epsilon': norm_epsilon, + 'activation': activation, + 'kernel_regularizer': kernel_regularizer, + 'leaky_alpha': leaky_alpha + } + + self._cam_args.update(self._common_args) + self._sam_args.update(self._common_args) + super().__init__(**kwargs) + + def build(self, input_shape): + self._cam = CAM(**self._cam_args) + self._sam = SAM(**self._sam_args) + + def call(self, inputs, training=None): + return self._sam(self._cam(inputs)) + + +@tf.keras.utils.register_keras_serializable(package='yolo') +class DarkRouteProcess(tf.keras.layers.Layer): + """ + process darknet outputs and connect back bone to head more generalizably + Abstracts repetition of DarkConv objects that is common in YOLO. + + It is used like the following: + + x = ConvBN(1024, (3, 3), (1, 1))(x) + proc = DarkRouteProcess(filters = 1024, + repetitions = 3, + insert_spp = False)(x) + """ + def __init__( + self, + filters=2, + repetitions=2, + insert_spp=False, + insert_sam=False, + insert_cbam=False, + csp_stack=0, + csp_scale=2, + kernel_initializer='glorot_uniform', + bias_initializer='zeros', + bias_regularizer=None, + use_sync_bn=False, + kernel_regularizer=None, # default find where is it is stated + norm_momentum=0.99, + norm_epsilon=0.001, + block_invert=False, + activation='leaky', + leaky_alpha=0.1, + spp_keys=None, + **kwargs): + """ + Args: + filters: the number of filters to be used in all subsequent layers + filters should be the depth of the tensor input into this layer, + as no downsampling can be done within this layer object. + repetitions: number of times to repeat the processign nodes + for tiny: 1 repition, no spp allowed + for spp: insert_spp = True, and allow for 3+ repetitions + for regular: insert_spp = False, and allow for 3+ repetitions. + insert_spp: bool if true add the spatial pyramid pooling layer. + kernel_initializer: method to use to initializa kernel weights. + bias_initializer: method to use to initialize the bias of the conv + layers. + norm_momentum: batch norm parameter see TensorFlow documentation. + norm_epsilon: batch norm parameter see TensorFlow documentation. + activation: activation function to use in processing. + leaky_alpha: if leaky acitivation function, the alpha to use in + processing the relu input. 
+ + Returns: + callable tensorflow layer + + Raises: + None + """ + + super().__init__(**kwargs) + # darkconv params + self._filters = filters + self._use_sync_bn = use_sync_bn + self._kernel_initializer = kernel_initializer + self._bias_initializer = bias_initializer + self._bias_regularizer = bias_regularizer + self._kernel_regularizer = kernel_regularizer + + # normal params + self._norm_momentum = norm_momentum + self._norm_epsilon = norm_epsilon + + # activation params + self._activation = activation + self._leaky_alpha = leaky_alpha + + repetitions += (2 * int(insert_spp)) + if repetitions == 1: + block_invert = True + + self._repetitions = repetitions + self.layer_list, self.outputs = self._get_base_layers() + + if csp_stack > 0: + self._csp_scale = csp_scale + csp_stack += (2 * int(insert_spp)) + self._csp_filters = lambda x: x // csp_scale + self._convert_csp(self.layer_list, self.outputs, csp_stack) + block_invert = False + + self._csp_stack = csp_stack + + if block_invert: + self._conv1_filters = lambda x: x + self._conv2_filters = lambda x: x // 2 + self._conv1_kernel = (3, 3) + self._conv2_kernel = (1, 1) + else: + self._conv1_filters = lambda x: x // 2 + self._conv2_filters = lambda x: x + self._conv1_kernel = (1, 1) + self._conv2_kernel = (3, 3) + + # insert SPP will always add to the total nuber of layer, never replace + if insert_spp: + self._spp_keys = spp_keys if spp_keys is not None else [5, 9, 13] + self.layer_list = self._insert_spp(self.layer_list) + + if repetitions > 1: + self.outputs[-2] = True + + if insert_sam: + self.layer_list = self._insert_sam(self.layer_list, self.outputs) + self._repetitions += 1 + self.outputs[-1] = True + + def _get_base_layers(self): + layer_list = [] + outputs = [] + for i in range(self._repetitions): + layers = ['conv1'] * ((i + 1) % 2) + ['conv2'] * (i % 2) + layer_list.extend(layers) + outputs = [False] + outputs + return layer_list, outputs + + def _insert_spp(self, layer_list): + if len(layer_list) <= 3: + layer_list[1] = 'spp' + else: + layer_list[3] = 'spp' + return layer_list + + def _convert_csp(self, layer_list, outputs, csp_stack_size): + layer_list[0] = 'csp_route' + layer_list.insert(csp_stack_size - 1, 'csp_connect') + outputs.insert(csp_stack_size - 1, False) + return layer_list, outputs + + def _insert_sam(self, layer_list, outputs): + if len(layer_list) >= 2 and layer_list[-2] != 'spp': + layer_list.insert(-2, 'sam') + outputs.insert(-1, True) + else: + layer_list.insert(-1, 'sam') + outputs.insert(-1, False) + return layer_list + + def _conv1(self, filters, kwargs, csp=False): + if csp: + filters_ = self._csp_filters + else: + filters_ = self._conv1_filters + + x1 = ConvBN( + filters=filters_(filters), + kernel_size=self._conv1_kernel, + strides=(1, 1), + padding='same', + use_bn=True, + **kwargs) + return x1 + + def _conv2(self, filters, kwargs, csp=False): + if csp: + filters_ = self._csp_filters + else: + filters_ = self._conv2_filters + + x1 = ConvBN( + filters=filters_(filters), + kernel_size=self._conv2_kernel, + strides=(1, 1), + padding='same', + use_bn=True, + **kwargs) + return x1 + + def _csp_route(self, filters, kwargs): + x1 = CSPRoute( + filters=filters, + filter_scale=self._csp_scale, + downsample=False, + **kwargs) + return x1 + + def _csp_connect(self, filters, kwargs): + x1 = CSPConnect(filters=filters, drop_final=True, drop_first=True, **kwargs) + return x1 + + def _spp(self, filters, kwargs): + x1 = SPP(self._spp_keys) + return x1 + + def _sam(self, filters, kwargs): + x1 = SAM(filters=-1, 
use_pooling=False, use_bn=True, **kwargs) + return x1 + + def build(self, input_shape): + dark_conv_args = { + 'activation': self._activation, + 'kernel_initializer': self._kernel_initializer, + 'bias_initializer': self._bias_initializer, + 'bias_regularizer': self._bias_regularizer, + 'use_sync_bn': self._use_sync_bn, + 'norm_momentum': self._norm_momentum, + 'norm_epsilon': self._norm_epsilon, + 'kernel_regularizer': self._kernel_regularizer, + 'leaky_alpha': self._leaky_alpha, + } + + csp = False + self.layers = [] + for layer in self.layer_list: + if layer == 'csp_route': + self.layers.append(self._csp_route(self._filters, dark_conv_args)) + csp = True + elif layer == 'csp_connect': + self.layers.append(self._csp_connect(self._filters, dark_conv_args)) + csp = False + elif layer == 'conv1': + self.layers.append(self._conv1(self._filters, dark_conv_args, csp=csp)) + elif layer == 'conv2': + self.layers.append(self._conv2(self._filters, dark_conv_args, csp=csp)) + elif layer == 'spp': + self.layers.append(self._spp(self._filters, dark_conv_args)) + elif layer == 'sam': + self.layers.append(self._sam(-1, _args)) + + self._lim = len(self.layers) + super().build(input_shape) + + def _call_regular(self, inputs, training=None): + # check efficiency + x = inputs + x_prev = x + output_prev = True + + for i, (layer, output) in enumerate(zip(self.layers, self.outputs)): + if output_prev: + x_prev = x + x = layer(x) + output_prev = output + return x_prev, x + + def _call_csp(self, inputs, training=None): + # check efficiency + x = inputs + x_prev = x + output_prev = True + x_route = None + + for i, (layer, output) in enumerate(zip(self.layers, self.outputs)): + if output_prev: + x_prev = x + if i == 0: + x, x_route = layer(x) + elif i == self._csp_stack - 1: + x = layer([x, x_route]) + else: + x = layer(x) + output_prev = output + return x_prev, x + + def call(self, inputs, training=None): + if self._csp_stack > 0: + return self._call_csp(inputs, training=training) + else: + return self._call_regular(inputs) diff --git a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py index 5df28a4f3fb..6664a80e722 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py +++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks_test.py @@ -13,18 +13,17 @@ # limitations under the License. 
# Lint as: python3 - -from absl.testing import parameterized -import numpy as np import tensorflow as tf +import numpy as np +from absl.testing import parameterized from official.vision.beta.projects.yolo.modeling.layers import nn_blocks class CSPConnectTest(tf.test.TestCase, parameterized.TestCase): - @parameterized.named_parameters(("same", 224, 224, 64, 1), - ("downsample", 224, 224, 64, 2)) + @parameterized.named_parameters(('same', 224, 224, 64, 1), + ('downsample', 224, 224, 64, 2)) def test_pass_through(self, width, height, filters, mod): x = tf.keras.Input(shape=(width, height, filters)) test_layer = nn_blocks.CSPRoute(filters=filters, filter_scale=mod) @@ -38,8 +37,8 @@ def test_pass_through(self, width, height, filters, mod): [None, np.ceil(width // 2), np.ceil(height // 2), (filters)]) - @parameterized.named_parameters(("same", 224, 224, 64, 1), - ("downsample", 224, 224, 128, 2)) + @parameterized.named_parameters(('same', 224, 224, 64, 1), + ('downsample', 224, 224, 128, 2)) def test_gradient_pass_though(self, filters, width, height, mod): loss = tf.keras.losses.MeanSquaredError() optimizer = tf.keras.optimizers.SGD() @@ -49,10 +48,11 @@ def test_gradient_pass_though(self, filters, width, height, mod): init = tf.random_normal_initializer() x = tf.Variable( initial_value=init(shape=(1, width, height, filters), dtype=tf.float32)) - y = tf.Variable(initial_value=init(shape=(1, int(np.ceil(width // 2)), - int(np.ceil(height // 2)), - filters), - dtype=tf.float32)) + y = tf.Variable( + initial_value=init( + shape=(1, int(np.ceil(width // 2)), int(np.ceil(height // 2)), + filters), + dtype=tf.float32)) with tf.GradientTape() as tape: x_hat, x_prev = test_layer(x) @@ -66,12 +66,12 @@ def test_gradient_pass_though(self, filters, width, height, mod): class CSPRouteTest(tf.test.TestCase, parameterized.TestCase): - @parameterized.named_parameters(("same", 224, 224, 64, 1), - ("downsample", 224, 224, 64, 2)) + @parameterized.named_parameters(('same', 224, 224, 64, 1), + ('downsample', 224, 224, 64, 2)) def test_pass_through(self, width, height, filters, mod): x = tf.keras.Input(shape=(width, height, filters)) test_layer = nn_blocks.CSPRoute(filters=filters, filter_scale=mod) - outx, _ = test_layer(x) + outx, px = test_layer(x) print(outx) print(outx.shape.as_list()) self.assertAllEqual( @@ -79,8 +79,8 @@ def test_pass_through(self, width, height, filters, mod): [None, np.ceil(width // 2), np.ceil(height // 2), (filters / mod)]) - @parameterized.named_parameters(("same", 224, 224, 64, 1), - ("downsample", 224, 224, 128, 2)) + @parameterized.named_parameters(('same', 224, 224, 64, 1), + ('downsample', 224, 224, 128, 2)) def test_gradient_pass_though(self, filters, width, height, mod): loss = tf.keras.losses.MeanSquaredError() optimizer = tf.keras.optimizers.SGD() @@ -90,10 +90,11 @@ def test_gradient_pass_though(self, filters, width, height, mod): init = tf.random_normal_initializer() x = tf.Variable( initial_value=init(shape=(1, width, height, filters), dtype=tf.float32)) - y = tf.Variable(initial_value=init(shape=(1, int(np.ceil(width // 2)), - int(np.ceil(height // 2)), - filters), - dtype=tf.float32)) + y = tf.Variable( + initial_value=init( + shape=(1, int(np.ceil(width // 2)), int(np.ceil(height // 2)), + filters), + dtype=tf.float32)) with tf.GradientTape() as tape: x_hat, x_prev = test_layer(x) @@ -107,11 +108,11 @@ def test_gradient_pass_though(self, filters, width, height, mod): class CSPStackTest(tf.test.TestCase, parameterized.TestCase): - def build_layer( - self, layer_type, 
filters, filter_scale, count, stack_type, downsample): + def build_layer(self, layer_type, filters, filter_scale, count, stack_type, + downsample): if stack_type is not None: layers = [] - if layer_type == "residual": + if layer_type == 'residual': for _ in range(count): layers.append( nn_blocks.DarkResidual( @@ -120,7 +121,7 @@ def build_layer( for _ in range(count): layers.append(nn_blocks.ConvBN(filters=filters)) - if stack_type == "model": + if stack_type == 'model': layers = tf.keras.Sequential(layers=layers) else: layers = None @@ -133,10 +134,10 @@ def build_layer( return stack @parameterized.named_parameters( - ("no_stack", 224, 224, 64, 2, "residual", None, 0, True), - ("residual_stack", 224, 224, 64, 2, "residual", "list", 2, True), - ("conv_stack", 224, 224, 64, 2, "conv", "list", 3, False), - ("callable_no_scale", 224, 224, 64, 1, "residual", "model", 5, False)) + ('no_stack', 224, 224, 64, 2, 'residual', None, 0, True), + ('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True), + ('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False), + ('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False)) def test_pass_through(self, width, height, filters, mod, layer_type, stack_type, count, downsample): x = tf.keras.Input(shape=(width, height, filters)) @@ -152,10 +153,10 @@ def test_pass_through(self, width, height, filters, mod, layer_type, self.assertAllEqual(outx.shape.as_list(), [None, width, height, filters]) @parameterized.named_parameters( - ("no_stack", 224, 224, 64, 2, "residual", None, 0, True), - ("residual_stack", 224, 224, 64, 2, "residual", "list", 2, True), - ("conv_stack", 224, 224, 64, 2, "conv", "list", 3, False), - ("callable_no_scale", 224, 224, 64, 1, "residual", "model", 5, False)) + ('no_stack', 224, 224, 64, 2, 'residual', None, 0, True), + ('residual_stack', 224, 224, 64, 2, 'residual', 'list', 2, True), + ('conv_stack', 224, 224, 64, 2, 'conv', 'list', 3, False), + ('callable_no_scale', 224, 224, 64, 1, 'residual', 'model', 5, False)) def test_gradient_pass_though(self, width, height, filters, mod, layer_type, stack_type, count, downsample): loss = tf.keras.losses.MeanSquaredError() @@ -188,10 +189,10 @@ def test_gradient_pass_though(self, width, height, filters, mod, layer_type, class ConvBNTest(tf.test.TestCase, parameterized.TestCase): @parameterized.named_parameters( - ("valid", (3, 3), "valid", (1, 1)), ("same", (3, 3), "same", (1, 1)), - ("downsample", (3, 3), "same", (2, 2)), ("test", (1, 1), "valid", (1, 1))) + ('valid', (3, 3), 'valid', (1, 1)), ('same', (3, 3), 'same', (1, 1)), + ('downsample', (3, 3), 'same', (2, 2)), ('test', (1, 1), 'valid', (1, 1))) def test_pass_through(self, kernel_size, padding, strides): - if padding == "same": + if padding == 'same': pad_const = 1 else: pad_const = 0 @@ -212,16 +213,16 @@ def test_pass_through(self, kernel_size, padding, strides): print(test) self.assertAllEqual(outx.shape.as_list(), test) - @parameterized.named_parameters(("filters", 3)) + @parameterized.named_parameters(('filters', 3)) def test_gradient_pass_though(self, filters): loss = tf.keras.losses.MeanSquaredError() optimizer = tf.keras.optimizers.SGD() - with tf.device("/CPU:0"): - test_layer = nn_blocks.ConvBN(filters, kernel_size=(3, 3), padding="same") + with tf.device('/CPU:0'): + test_layer = nn_blocks.ConvBN(filters, kernel_size=(3, 3), padding='same') init = tf.random_normal_initializer() - x = tf.Variable(initial_value=init(shape=(1, 224, 224, - 3), dtype=tf.float32)) + x = tf.Variable( + initial_value=init(shape=(1, 
224, 224, 3), dtype=tf.float32)) y = tf.Variable( initial_value=init(shape=(1, 224, 224, filters), dtype=tf.float32)) @@ -235,9 +236,9 @@ def test_gradient_pass_though(self, filters): class DarkResidualTest(tf.test.TestCase, parameterized.TestCase): - @parameterized.named_parameters(("same", 224, 224, 64, False), - ("downsample", 223, 223, 32, True), - ("oddball", 223, 223, 32, False)) + @parameterized.named_parameters(('same', 224, 224, 64, False), + ('downsample', 223, 223, 32, True), + ('oddball', 223, 223, 32, False)) def test_pass_through(self, width, height, filters, downsample): mod = 1 if downsample: @@ -252,9 +253,9 @@ def test_pass_through(self, width, height, filters, downsample): [None, np.ceil(width / mod), np.ceil(height / mod), filters]) - @parameterized.named_parameters(("same", 64, 224, 224, False), - ("downsample", 32, 223, 223, True), - ("oddball", 32, 223, 223, False)) + @parameterized.named_parameters(('same', 64, 224, 224, False), + ('downsample', 32, 223, 223, True), + ('oddball', 32, 223, 223, False)) def test_gradient_pass_though(self, filters, width, height, downsample): loss = tf.keras.losses.MeanSquaredError() optimizer = tf.keras.optimizers.SGD() @@ -268,10 +269,11 @@ def test_gradient_pass_though(self, filters, width, height, downsample): init = tf.random_normal_initializer() x = tf.Variable( initial_value=init(shape=(1, width, height, filters), dtype=tf.float32)) - y = tf.Variable(initial_value=init(shape=(1, int(np.ceil(width / mod)), - int(np.ceil(height / mod)), - filters), - dtype=tf.float32)) + y = tf.Variable( + initial_value=init( + shape=(1, int(np.ceil(width / mod)), int(np.ceil(height / mod)), + filters), + dtype=tf.float32)) with tf.GradientTape() as tape: x_hat = test_layer(x) @@ -281,5 +283,102 @@ def test_gradient_pass_though(self, filters, width, height, downsample): self.assertNotIn(None, grad) -if __name__ == "__main__": + +class DarkSppTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters(('RouteProcessSpp', 224, 224, 3, [5, 9, 13]), + ('test1', 300, 300, 10, [2, 3, 4, 5]), + ('test2', 256, 256, 5, [10])) + def test_pass_through(self, width, height, channels, sizes): + x = tf.keras.Input(shape=(width, height, channels)) + test_layer = nn_blocks.SPP(sizes=sizes) + outx = test_layer(x) + self.assertAllEqual(outx.shape.as_list(), + [None, width, height, channels * (len(sizes) + 1)]) + return + + @parameterized.named_parameters(('RouteProcessSpp', 224, 224, 3, [5, 9, 13]), + ('test1', 300, 300, 10, [2, 3, 4, 5]), + ('test2', 256, 256, 5, [10])) + def test_gradient_pass_though(self, width, height, channels, sizes): + loss = tf.keras.losses.MeanSquaredError() + optimizer = tf.keras.optimizers.SGD() + test_layer = nn_blocks.SPP(sizes=sizes) + + init = tf.random_normal_initializer() + x = tf.Variable( + initial_value=init( + shape=(1, width, height, channels), dtype=tf.float32)) + y = tf.Variable( + initial_value=init( + shape=(1, width, height, channels * (len(sizes) + 1)), + dtype=tf.float32)) + + with tf.GradientTape() as tape: + x_hat = test_layer(x) + grad_loss = loss(x_hat, y) + grad = tape.gradient(grad_loss, test_layer.trainable_variables) + optimizer.apply_gradients(zip(grad, test_layer.trainable_variables)) + + self.assertNotIn(None, grad) + return + + +class DarkRouteProcessTest(tf.test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('test1', 224, 224, 64, 7, False), ('test2', 223, 223, 32, 3, False), + ('tiny', 223, 223, 16, 1, False), ('spp', 224, 224, 64, 7, False)) + def 
test_pass_through(self, width, height, filters, repetitions, spp): + x = tf.keras.Input(shape=(width, height, filters)) + test_layer = nn_blocks.DarkRouteProcess( + filters=filters, repetitions=repetitions, insert_spp=spp) + outx = test_layer(x) + self.assertEqual(len(outx), 2, msg='len(outx) != 2') + if repetitions == 1: + filter_y1 = filters + else: + filter_y1 = filters // 2 + self.assertAllEqual(outx[1].shape.as_list(), [None, width, height, filter_y1]) + self.assertAllEqual( + filters % 2, + 0, + msg='Output of a DarkRouteProcess layer has an odd number of filters') + self.assertAllEqual(outx[0].shape.as_list(), [None, width, height, filters]) + + @parameterized.named_parameters( + ('test1', 224, 224, 64, 7, False), ('test2', 223, 223, 32, 3, False), + ('tiny', 223, 223, 16, 1, False), ('spp', 224, 224, 64, 7, False)) + def test_gradient_pass_though(self, width, height, filters, repetitions, spp): + loss = tf.keras.losses.MeanSquaredError() + optimizer = tf.keras.optimizers.SGD() + test_layer = nn_blocks.DarkRouteProcess( + filters=filters, repetitions=repetitions, insert_spp=spp) + + if repetitions == 1: + filter_y1 = filters + else: + filter_y1 = filters // 2 + + init = tf.random_normal_initializer() + x = tf.Variable( + initial_value=init(shape=(1, width, height, filters), dtype=tf.float32)) + y_0 = tf.Variable( + initial_value=init(shape=(1, width, height, filters), dtype=tf.float32)) + y_1 = tf.Variable( + initial_value=init(shape=(1, width, height, filter_y1), dtype=tf.float32)) + + with tf.GradientTape() as tape: + x_hat_0, x_hat_1 = test_layer(x) + grad_loss_0 = loss(x_hat_0, y_0) + grad_loss_1 = loss(x_hat_1, y_1) + grad = tape.gradient([grad_loss_0, grad_loss_1], + test_layer.trainable_variables) + optimizer.apply_gradients(zip(grad, test_layer.trainable_variables)) + + self.assertNotIn(None, grad) + return + + +if __name__ == '__main__': tf.test.main() From 725b8c8c85d36fbee58f0025364f4e1acdcb0c94 Mon Sep 17 00:00:00 2001 From: Anirudh Vegesana Date: Wed, 26 May 2021 23:39:14 -0400 Subject: [PATCH 09/10] disclaimer (#10020) Co-authored-by: Vishnu Banna <43182884+vishnubanna@users.noreply.github.com> --- official/vision/beta/projects/yolo/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/official/vision/beta/projects/yolo/README.md b/official/vision/beta/projects/yolo/README.md index 0a1e27fbe90..166ac8d9842 100644 --- a/official/vision/beta/projects/yolo/README.md +++ b/official/vision/beta/projects/yolo/README.md @@ -74,3 +74,7 @@ head could be connected to a new, more powerful backbone if a person chose to. [![TensorFlow 2.2](https://img.shields.io/badge/TensorFlow-2.2-FF6F00?logo=tensorflow)](https://github.com/tensorflow/tensorflow/releases/tag/v2.2.0) [![Python 3.8](https://img.shields.io/badge/Python-3.8-3776AB)](https://www.python.org/downloads/release/python-380/) + + + +DISCLAIMER: this YOLO implementation is still under development. No support will be provided during the development phase. 
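For orientation on the nn_blocks rework in PATCH 08/10, the sketch below shows how two of the refactored layers are typically exercised, mirroring the shape checks in `nn_blocks_test.py`. It is illustrative only and not part of the patch series; the input size and layer arguments are example values chosen for the demonstration.

```python
# Illustrative usage sketch, not part of the patch series.
# Input size and layer arguments are example values.
import tensorflow as tf
from official.vision.beta.projects.yolo.modeling.layers import nn_blocks

x = tf.keras.Input(shape=(224, 224, 64))

# SPP concatenates the input with one max-pooled copy per window size,
# so the channel count grows to 64 * (len(sizes) + 1) = 256 here.
spp_out = nn_blocks.SPP(sizes=[5, 9, 13])(x)

# DarkRouteProcess returns a (route, output) pair; with repetitions > 1 the
# second tensor carries filters // 2 channels, as asserted in the unit tests.
route, out = nn_blocks.DarkRouteProcess(
    filters=64, repetitions=3, insert_spp=False)(x)

print(spp_out.shape, route.shape, out.shape)
# expected: (None, 224, 224, 256) (None, 224, 224, 64) (None, 224, 224, 32)
```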
From 9dd1c2c0aa2d4f4508543e9c34c2aa76678f8706 Mon Sep 17 00:00:00 2001 From: anivegesana Date: Thu, 27 May 2021 19:07:15 -0400 Subject: [PATCH 10/10] Fix some PyLint errors --- .../projects/yolo/modeling/decoders/yolo_decoder.py | 12 ++++++------ .../beta/projects/yolo/modeling/layers/nn_blocks.py | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py index 1dbaae6ebf1..ae66e3797cf 100644 --- a/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py +++ b/official/vision/beta/projects/yolo/modeling/decoders/yolo_decoder.py @@ -331,13 +331,13 @@ def get_raw_depths(self, minimum_depth, inputs): Args: minimum_depth: `int` depth of the smallest branch of the FPN. - inputs: `dict[str, tf.InputSpec]` of the shape of input args as a dictionary of - lists. + inputs: `dict[str, tf.InputSpec]` of the shape of input args as a + dictionary of lists. Returns: The unscaled depths of the FPN branches. """ - + depths = [] if len(inputs.keys()) > 3 or self._fpn_filter_scale > 1: for i in range(self._min_level, self._max_level + 1): @@ -386,8 +386,8 @@ def __init__(self, kernel_regularizer=None, bias_regularizer=None, **kwargs): - """Yolo Decoder initialization function. A unified model that ties all decoder - components into a conditionally build YOLO decder. + """Yolo Decoder initialization function. A unified model that ties all + decoder components into a conditionally build YOLO decoder. Args: input_specs: `dict[str, tf.InputSpec]`: input specs of each of the inputs @@ -409,7 +409,7 @@ def __init__(self, zero. kernel_initializer: kernel_initializer for convolutional layers. kernel_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. - bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2d. + bias_regularizer: tf.keras.regularizers.Regularizer object for Conv2D. **kwargs: keyword arguments to be passed. """ diff --git a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py index 9897def3ad3..02895ff3db4 100644 --- a/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py +++ b/official/vision/beta/projects/yolo/modeling/layers/nn_blocks.py @@ -1152,8 +1152,8 @@ def build(self, input_shape): def call(self, inputs, training=None): if self._use_pooling: - depth_max = tf.reduce_max(inputs, axis=-1, keep_dims=True) - depth_avg = tf.reduce_mean(inputs, axis=-1, keep_dims=True) + depth_max = tf.reduce_max(inputs, axis=-1, keepdims=True) + depth_avg = tf.reduce_mean(inputs, axis=-1, keepdims=True) input_maps = tf.concat([depth_avg, depth_max], axis=-1) else: input_maps = inputs @@ -1545,7 +1545,7 @@ def build(self, input_shape): elif layer == 'spp': self.layers.append(self._spp(self._filters, dark_conv_args)) elif layer == 'sam': - self.layers.append(self._sam(-1, _args)) + self.layers.append(self._sam(-1, dark_conv_args)) self._lim = len(self.layers) super().build(input_shape)
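The `keep_dims` to `keepdims` rename in PATCH 10/10 matters because the TF2 `tf.reduce_max`/`tf.reduce_mean` APIs only accept `keepdims`. As a quick illustrative check (not part of the patch), the snippet below reproduces the channel-wise pooling that `SAM.call` applies when `use_pooling=True`, using the corrected argument; the feature-map shape is an example value.

```python
# Illustrative only: the channel pooling used by SAM when use_pooling=True,
# written with the TF2 `keepdims` argument that PATCH 10/10 switches to.
import tensorflow as tf

x = tf.random.normal([1, 8, 8, 32])                     # example feature map
depth_max = tf.reduce_max(x, axis=-1, keepdims=True)    # (1, 8, 8, 1)
depth_avg = tf.reduce_mean(x, axis=-1, keepdims=True)   # (1, 8, 8, 1)
pooled = tf.concat([depth_avg, depth_max], axis=-1)     # (1, 8, 8, 2)
print(pooled.shape)
```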