Apply isort and black reformatting
Signed-off-by: huvunvidia <[email protected]>
huvunvidia committed Oct 2, 2024
1 parent ff11a07 commit 1deddfe
Showing 8 changed files with 51 additions and 68 deletions.
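For context, formatting commits like this one are normally produced by running isort and then black over the repository. The short sketch below is not part of the commit; it is an illustrative, in-memory reproduction of the kind of change seen in these diffs (the MegatronOptimizerModule import reordered after the lr_scheduler import), assuming the isort and black packages are installed and using their public Python APIs.

# Illustrative only: reproduce the commit's import reordering in memory.
# Assumes the `isort` and `black` packages are installed; the sample
# source string below is just an example, not the repository contents.
import black
import isort

source = (
    "from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule\n"
    "from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler\n"
)

sorted_source = isort.code(source)  # lr_scheduler now sorts before megatron
formatted = black.format_str(sorted_source, mode=black.Mode())
print(formatted)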
10 changes: 5 additions & 5 deletions examples/llm/megatron_t5_finetuning.py
@@ -14,8 +14,8 @@
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning import NeMoLogger
from nemo.lightning.pytorch.callbacks import ModelCheckpoint
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule
from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule


def get_args():
@@ -43,9 +43,9 @@ def get_args():
)

data = SquadDataModule(
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
global_batch_size=128,
tokenizer=tokenizer,
num_workers=4,
@@ -98,7 +98,7 @@ def get_args():
)
opt = MegatronOptimizerModule(
config=opt_config,
)
)

trainer = nl.Trainer(
devices=args.devices,
3 changes: 2 additions & 1 deletion examples/nlp/language_modeling/megatron_t5_pretraining.py
@@ -34,6 +34,7 @@ def main(cfg) -> None:

# DEBUGGING
import torch

print("model (before): ")
print(model)
for name, param in model.named_parameters():
@@ -47,7 +48,7 @@ def main(cfg) -> None:
trainer.fit(model)

# DEBUGGING
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
print("model (after): ")
print(model)
for name, param in model.named_parameters():
3 changes: 2 additions & 1 deletion (file name not shown)
@@ -228,7 +228,8 @@ def main(cfg) -> None:

# DEBUGGING
import torch
if torch.distributed.get_rank()==0:

if torch.distributed.get_rank() == 0:
print("model: ")
print(model)
for name, param in model.named_parameters():
2 changes: 1 addition & 1 deletion nemo/collections/llm/t5/data/core.py
@@ -43,4 +43,4 @@ def create_sft_dataset(
add_eos_to_input=add_eos,
replace_bos_with_pad=replace_bos_with_pad,
index_mapping_dir=index_mapping_dir,
)
)
3 changes: 1 addition & 2 deletions nemo/collections/llm/t5/data/fine_tuning.py
@@ -57,10 +57,9 @@ def __init__(
self.seed = seed
self.dataset_root = Path(dataset_root)

from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

# add additional tokens for T5 tokenizer
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer

self.tokenizer = tokenizer or get_nmt_tokenizer("megatron", "BertWordPieceCase")
additional_tokens = {'additional_special_tokens': [f'<extra_id_{i}>' for i in range(100)]}
self.tokenizer.add_special_tokens(additional_tokens)
25 changes: 9 additions & 16 deletions nemo/collections/llm/t5/model/t5.py
@@ -11,13 +11,12 @@
from torch import nn

from nemo.collections.llm import fn
from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule

from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d

HAVE_TE = True
try:
import transformer_engine
@@ -42,22 +41,16 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
else:
_batch = batch

# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
if 'enc_dec_mask' not in _batch:
encoder_attn_mask_3d = build_attention_mask_3d(
_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding
)
decoder_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal
)
enc_dec_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding
)
encoder_attn_mask_3d = build_attention_mask_3d(_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding)
decoder_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal)
enc_dec_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding)
_batch['enc_mask'] = encoder_attn_mask_3d
_batch['dec_mask'] = decoder_attn_mask_3d
_batch['enc_dec_mask'] = enc_dec_attn_mask_3d

# if Dataset object is Mcore T5 dataset (e.g. pretraining)
# if Dataset object is Mcore T5 dataset (e.g. pretraining)
else:
# convert attention mask values from int to True/False
_batch['enc_mask'] = _batch['enc_mask'] < 0.5
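As an aside (not part of this commit): the branch above builds full 3D attention masks from the 1D enc/dec masks when the batch comes from NeMo 1.0's T5SFTDataset. A minimal sketch of the general idea, an outer product of per-token masks, is shown below; NeMo's build_attention_mask_3d may use a different convention for which value means "masked", so treat this as illustrative only.

# Illustrative sketch: derive a [batch, q_len, k_len] padding mask from two
# [batch, len] token masks (1 = real token, 0 = padding). The convention of
# NeMo's build_attention_mask_3d may differ; this is not its implementation.
import torch


def padding_mask_3d(query_mask: torch.Tensor, key_mask: torch.Tensor) -> torch.Tensor:
    return query_mask.unsqueeze(2) * key_mask.unsqueeze(1)


enc_mask = torch.tensor([[1, 1, 0]])     # [batch=1, enc_len=3]
dec_mask = torch.tensor([[1, 1, 1, 0]])  # [batch=1, dec_len=4]
print(padding_mask_3d(dec_mask, enc_mask).shape)  # torch.Size([1, 4, 3])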
@@ -192,7 +185,7 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
print(model)
print("encoder_config: ", encoder_config)
torch.set_printoptions(precision=20)
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
for name, param in model.named_parameters():
print("{}: {} - {}".format(name, param.shape, torch.norm(param)))
if "embedding.position_embeddings.weight" in name:
@@ -202,7 +195,7 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
print(param.shape)
print(param.mean())
print(param.std())
print(stop_here)
print(stop_here)

return model

63 changes: 26 additions & 37 deletions nemo/collections/llm/t5/model/t5_sft.py
@@ -1,6 +1,6 @@
import copy
from dataclasses import dataclass
from typing import TYPE_CHECKING, Callable, Dict, Literal, Optional, Union
import copy

import pytorch_lightning as L
import torch
@@ -11,13 +11,12 @@
from torch import nn

from nemo.collections.llm import fn
from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule

from nemo.collections.nlp.modules.common.megatron.token_level_encoder_decoder import AttnMaskType
from nemo.collections.nlp.modules.common.megatron.utils import build_attention_mask_3d

HAVE_TE = True
try:
import transformer_engine
@@ -45,22 +44,16 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
else:
_batch = batch

# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
# if Dataset object is NeMo 1.0's T5SFTDataset (e.g. when finetuning with SQUAD)
if 'enc_dec_mask' not in _batch:
encoder_attn_mask_3d = build_attention_mask_3d(
_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding
)
decoder_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal
)
enc_dec_attn_mask_3d = build_attention_mask_3d(
_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding
)
encoder_attn_mask_3d = build_attention_mask_3d(_batch['enc_mask'], _batch['enc_mask'], AttnMaskType.padding)
decoder_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['dec_mask'], AttnMaskType.causal)
enc_dec_attn_mask_3d = build_attention_mask_3d(_batch['dec_mask'], _batch['enc_mask'], AttnMaskType.padding)
_batch['enc_mask'] = encoder_attn_mask_3d
_batch['dec_mask'] = decoder_attn_mask_3d
_batch['enc_dec_mask'] = enc_dec_attn_mask_3d

# if Dataset object is Mcore T5 dataset (e.g. pretraining)
# if Dataset object is Mcore T5 dataset (e.g. pretraining)
else:
# convert attention mask values from int to True/False
_batch['enc_mask'] = _batch['enc_mask'] < 0.5
@@ -76,7 +69,6 @@ def t5_data_step(dataloader_iter) -> Dict[str, torch.Tensor]:
# if self.get_attention_mask_from_fusion:
# required_keys.remove('attention_mask')


output = {key: val.cuda(non_blocking=True) if key in required_keys else None for key, val in _batch.items()}

return output
@@ -97,8 +89,8 @@ def t5_forward_step(model, batch) -> torch.Tensor:

def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
from megatron.core.models.T5.t5_spec import (
get_t5_encoder_with_transformer_engine_block_spec,
get_t5_decoder_with_transformer_engine_block_spec,
get_t5_encoder_with_transformer_engine_block_spec,
)

en_block_spec = get_t5_encoder_with_transformer_engine_block_spec(encoder_config.num_layers)
@@ -109,8 +101,8 @@ def transformer_engine_layer_spec(encoder_config: "T5Config", decoder_config: "T

def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
from megatron.core.models.T5.t5_spec import (
get_t5_encoder_with_local_block_spec,
get_t5_decoder_with_local_block_spec,
get_t5_encoder_with_local_block_spec,
)

en_block_spec = get_t5_encoder_with_local_block_spec(encoder_config.num_layers)
@@ -119,7 +111,6 @@ def local_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") ->
return [en_block_spec, de_block_spec]



def default_layer_spec(encoder_config: "T5Config", decoder_config: "T5Config") -> ModuleSpec:
if HAVE_TE:
return transformer_engine_layer_spec(encoder_config, decoder_config)
@@ -158,23 +149,21 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
from megatron.core import parallel_state
from megatron.core.models.T5.t5_model import T5Model as MCoreT5Model


# DEBUGGING
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
print("Debugging: matching NeMo 1.0 Transformers config.")
self.enable_autocast=True
self.autocast_dtype=torch.bfloat16
self.deallocate_pipeline_outputs=True
self.pipeline_model_parallel_split_rank=0
self.attention_softmax_in_fp32=False
self.bias_activation_fusion=True
self.masked_softmax_fusion=True
self.persist_layer_norm=True
self.bias_dropout_fusion=True
self.recompute_num_layers=1
self.num_moe_experts=1
self.distribute_saved_activations=False

self.enable_autocast = True
self.autocast_dtype = torch.bfloat16
self.deallocate_pipeline_outputs = True
self.pipeline_model_parallel_split_rank = 0
self.attention_softmax_in_fp32 = False
self.bias_activation_fusion = True
self.masked_softmax_fusion = True
self.persist_layer_norm = True
self.bias_dropout_fusion = True
self.recompute_num_layers = 1
self.num_moe_experts = 1
self.distribute_saved_activations = False

encoder_config = copy.deepcopy(self)
encoder_config.num_layers = self.encoder_num_layers
@@ -204,15 +193,13 @@ def configure_model(self, tokenizer) -> "MCoreT5Model":
post_process=parallel_state.is_pipeline_last_stage(),
)


# DEBUGGING
if torch.distributed.get_rank()==0:
if torch.distributed.get_rank() == 0:
print("model: ")
print(model)
for name, param in model.named_parameters():
print("{}: {} - {}".format(name, param.shape, torch.norm(param)))


return model


@@ -240,6 +227,7 @@ def configure_model(self) -> None:

# DEBUGGING
from megatron.core.enums import ModelType

self.module.model_type = ModelType.encoder_and_decoder

def forward(
@@ -293,6 +281,7 @@ def validation_loss_reduction(self) -> MaskedTokenLossReduction:

return self._validation_loss_reduction


__all__ = [
"T5Model",
"T5Config",
10 changes: 5 additions & 5 deletions tests/collections/llm/megatron_t5_finetuning.py
@@ -14,8 +14,8 @@
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.lightning import NeMoLogger
from nemo.lightning.pytorch.callbacks import ModelCheckpoint
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule
from nemo.lightning.pytorch.optim.lr_scheduler import WarmupAnnealingScheduler

Check notice (Code scanning / CodeQL, Note, test): Unused import: Import of 'WarmupAnnealingScheduler' is not used.
from nemo.lightning.pytorch.optim.megatron import MegatronOptimizerModule


def get_args():
@@ -43,9 +43,9 @@ def get_args():
)

data = SquadDataModule(
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
seq_length=512,
seq_length_dec=128,
micro_batch_size=16,
global_batch_size=128,
tokenizer=tokenizer,
num_workers=4,
@@ -98,7 +98,7 @@ def get_args():
)
opt = MegatronOptimizerModule(
config=opt_config,
)
)

trainer = nl.Trainer(
devices=args.devices,
