Apply isort and black reformatting
Signed-off-by: huvunvidia <[email protected]>
huvunvidia committed Oct 2, 2024
1 parent ff11a07 commit 1deddfe
Showing 8 changed files with 51 additions and 68 deletions.
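For context, formatting commits like this one are normally produced by running isort and then black over the repository. The short sketch below is not part of the commit; it is an illustrative, in-memory reproduction of the kind of change seen in these diffs (the MegatronOptimizerModule import reordered after the lr_scheduler import), assuming the isort and black packages are installed and using their public Python APIs.

# Illustrative only: reproduce the commit's import reordering in memory.
# Assumes the `isort` and `black` packages are installed; the sample
# source string below is just an example, not the repository contents.
import black
import isort

source = (
    "from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule\n"
    "from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler\n"
)

sorted_source = isort.code(source)  # lr_scheduler now sorts before megatron
formatted = black.format_str(sorted_source, mode=black.Mode())
print(formatted)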
10 changes: 5 additions & 5 deletions examples/llm/megatron_t5_finetuning.py
@@ -14,8 +14,8 @@
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning import NeMoLogger
from nemo.lightning.pytorch.callbacks import ModelCheckpoint
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule
from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule


def get_args():
@@ -43,9 +43,9 @@ def get_args():
)

data = SquadDataModule(
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
global_batch_size=128,
tokenizer=tokenizer,
num_workers=4,
@@ -98,7 +98,7 @@ def get_args():
)
opt = MegatronOptimizerModule(
config=opt_config,
)
)

trainer = nl.Trainer(
devices=args.devices,
3 changes: 2 additions & 1 deletion examples/nlp/language_modeling/megatron_t5_pretraining.py
@@ -34,6 +34,7 @@ def main(cfg) -> None:

# DEBUGGING
import torch

print("model (before): ")
print(model)
for name, param in model.named_parameters():
@@ -47,7 +48,7 @@ def main(cfg) -> None:
trainer.fit(model)

# DEBUGGING
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
print("model (after): ")
print(model)
for name, param in model.named_parameters():
3 changes: 2 additions & 1 deletion (file name not shown)
@@ -228,7 +228,8 @@ def main(cfg) -> None:

# DEBUGGING
import torch
if torch.distributed.get_rank()==0:

if torch.distributed.get_rank() == 0:
print("model: ")
print(model)
for name, param in model.named_parameters():
2 changes: 1 addition & 1 deletion nemo/collections/llm/t5/data/core.py
@@ -43,4 +43,4 @@ def create_sft_dataset(
add_eos_to_input=add_eos,
replace_bos_with_pad=replace_bos_with_pad,
index_mapping_dir=index_mapping_dir,
)
)
3 changes: 1 addition & 2 deletions nemo/collections/llm/t5/data/fine_tuning.py
@@ -57,10 +57,9 @@ def __init__(
self.seed = seed
self.dataset_root = Path(dataset_root)

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

# add additional tokens for T5 tokenizer
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase")
additional_tokens = {'additional_special_tokens': [f'<extra_id_{i}>' for i in range(100)]}
self.tokenizer.add_special_tokens(additional_tokens)
25 changes: 9 additions & 16 deletions nemo/collections/llm/t5/model/t5.py
@@ -11,13 +11,12 @@
from torch import nn

from nemo.collections.llm import fn
from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule

from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d

HAVE_TE = True
try:
import transformer_engine
@@ -42,22 +41,16 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
else:
_batch = batch

# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
if 'enc_dec_mask' not in _batch:
encoder_attn_mask_3d = build_attention_mask_3d(
_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding
)
decoder_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal
)
enc_dec_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding
)
encoder_attn_mask_3d = build_attention_mask_3d(_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding)
decoder_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal)
enc_dec_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding)
_batch['enc_mask'] = encoder_attn_mask_3d
_batch['dec_mask'] = decoder_attn_mask_3d
_batch['enc_dec_mask'] = enc_dec_attn_mask_3d

# if Dataset object is Mcore T5 dataset (e.g. pretraining)
# if Dataset object is Mcore T5 dataset (e.g. pretraining)
else:
# convert attention mask values from int to True/False
_batch['enc_mask'] = _batch['enc_mask'] < 0.5
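As an aside (not part of this commit): the branch above builds full 3D attention masks from the 1D enc/dec masks when the batch comes from NeMo 1.0's T5SFTDataset. A minimal sketch of the general idea, an outer product of per-token masks, is shown below; NeMo's build_attention_mask_3d may use a different convention for which value means "masked", so treat this as illustrative only.

# Illustrative sketch: derive a [batch, q_len, k_len] padding mask from two
# [batch, len] token masks (1 = real token, 0 = padding). The convention of
# NeMo's build_attention_mask_3d may differ; this is not its implementation.
import torch


def padding_mask_3d(query_mask: torch.Tensor, key_mask: torch.Tensor) -> torch.Tensor:
    return query_mask.unsqueeze(2) * key_mask.unsqueeze(1)


enc_mask = torch.tensor([[1, 1, 0]])     # [batch=1, enc_len=3]
dec_mask = torch.tensor([[1, 1, 1, 0]])  # [batch=1, dec_len=4]
print(padding_mask_3d(dec_mask, enc_mask).shape)  # torch.Size([1, 4, 3])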
@@ -192,7 +185,7 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
print(model)
print("encoder_config: ", encoder_config)
torch.set_printoptions(precision=20)
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
for name, param in model.named_parameters():
print("{}: {} - {}".format(name, param.shape, torch.norm(param)))
if "embedding.position_embeddings.weight" in name:
@@ -202,7 +195,7 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
print(param.shape)
print(param.mean())
print(param.std())
print(stop_here)
print(stop_here)

return model

63 changes: 26 additions & 37 deletions nemo/collections/llm/t5/model/t5_sft.py
@@ -1,6 +1,6 @@
import copy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union
import copy

import pytorch_lightning as L
import torch
@@ -11,13 +11,12 @@
from torch import nn

from nemo.collections.llm import fn
from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule

from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d

HAVE_TE = True
try:
import transformer_engine
@@ -45,22 +44,16 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
else:
_batch = batch

# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
if 'enc_dec_mask' not in _batch:
encoder_attn_mask_3d = build_attention_mask_3d(
_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding
)
decoder_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal
)
enc_dec_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding
)
encoder_attn_mask_3d = build_attention_mask_3d(_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding)
decoder_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal)
enc_dec_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding)
_batch['enc_mask'] = encoder_attn_mask_3d
_batch['dec_mask'] = decoder_attn_mask_3d
_batch['enc_dec_mask'] = enc_dec_attn_mask_3d

# if Dataset object is Mcore T5 dataset (e.g. pretraining)
# if Dataset object is Mcore T5 dataset (e.g. pretraining)
else:
# convert attention mask values from int to True/False
_batch['enc_mask'] = _batch['enc_mask'] < 0.5
@@ -76,7 +69,6 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
# if self.get_attention_mask_from_fusion:
# required_keys.remove('attention_mask')


output = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()}

return output
@@ -97,8 +89,8 @@ def t5_forward_step(model, batch) -> torch.Tensor:

def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
from megatron.core.models.T5.t5_spec import (
get_t5_encoder_with_transformer_engine_block_spec,
get_t5_decoder_with_transformer_engine_block_spec,
get_t5_encoder_with_transformer_engine_block_spec,
)

en_block_spec = get_t5_encoder_with_transformer_engine_block_spec(encoder_config.num_layers)
@@ -109,8 +101,8 @@ def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T

def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
from megatron.core.models.T5.t5_spec import (
get_t5_encoder_with_local_block_spec,
get_t5_decoder_with_local_block_spec,
get_t5_encoder_with_local_block_spec,
)

en_block_spec = get_t5_encoder_with_local_block_spec(encoder_config.num_layers)
@@ -119,7 +111,6 @@ def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") ->
return [en_block_spec, de_block_spec]



def default_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
if HAVE_TE:
return transformer_engine_layer_spec(encoder_config, decoder_config)
@@ -158,23 +149,21 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
from megatron.core import parallel_state
from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model


# DEBUGGING
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
print("Debugging: matching NeMo 1.0 Transformers config.")
self.enable_autocast=True
self.autocast_dtype=torch.bfloat16
self.deallocate_pipeline_outputs=True
self.pipeline_model_parallel_split_rank=0
self.attention_softmax_in_fp32=False
self.bias_activation_fusion=True
self.masked_softmax_fusion=True
self.persist_layer_norm=True
self.bias_dropout_fusion=True
self.recompute_num_layers=1
self.num_moe_experts=1
self.distribute_saved_activations=False

self.enable_autocast = True
self.autocast_dtype = torch.bfloat16
self.deallocate_pipeline_outputs = True
self.pipeline_model_parallel_split_rank = 0
self.attention_softmax_in_fp32 = False
self.bias_activation_fusion = True
self.masked_softmax_fusion = True
self.persist_layer_norm = True
self.bias_dropout_fusion = True
self.recompute_num_layers = 1
self.num_moe_experts = 1
self.distribute_saved_activations = False

encoder_config = copy.deepcopy(self)
encoder_config.num_layers = self.encoder_num_layers
@@ -204,15 +193,13 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
post_process=parallel_state.is_pipeline_last_stage(),
)


# DEBUGGING
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
print("model: ")
print(model)
for name, param in model.named_parameters():
print("{}: {} - {}".format(name, param.shape, torch.norm(param)))


return model


@@ -240,6 +227,7 @@ def configure_model(self) -> None:

# DEBUGGING
from megatron.core.enums import ModelType

self.module.model_type = ModelType.encoder_and_decoder

def forward(
@@ -293,6 +281,7 @@ def validation_loss_reduction(self) -> MaskedTokenLossReduction:

return self._validation_loss_reduction


__all__ = [
"T5Model",
"T5Config",
10 changes: 5 additions & 5 deletions tests/collections/llm/megatron_t5_finetuning.py
@@ -14,8 +14,8 @@
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning import NeMoLogger
from nemo.lightning.pytorch.callbacks import ModelCheckpoint
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule
from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler

Check notice (Code scanning / CodeQL, Note, test): Unused import: Import of 'WarmupAnnealingScheduler' is not used.
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule


def get_args():
@@ -43,9 +43,9 @@ def get_args():
)

data = SquadDataModule(
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
global_batch_size=128,
tokenizer=tokenizer,
num_workers=4,
@@ -98,7 +98,7 @@ def get_args():
)
opt = MegatronOptimizerModule(
config=opt_config,
)
)

trainer = nl.Trainer(
devices=args.devices,
