diff --git a/elleelleaime/generate/strategies/models/huggingface/codellama/__init__.py b/elleelleaime/generate/strategies/models/huggingface/codellama/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/elleelleaime/generate/strategies/models/huggingface/codellama.py b/elleelleaime/generate/strategies/models/huggingface/codellama/codellama_infilling.py
similarity index 79%
rename from elleelleaime/generate/strategies/models/huggingface/codellama.py
rename to elleelleaime/generate/strategies/models/huggingface/codellama/codellama_infilling.py
index bdc28707..2008cec3 100644
--- a/elleelleaime/generate/strategies/models/huggingface/codellama.py
+++ b/elleelleaime/generate/strategies/models/huggingface/codellama/codellama_infilling.py
@@ -11,20 +11,19 @@
 @dataclass
 class GenerateSettings:
     name: str
-    num_beams: int = 1
     do_sample: bool = False
-    temperature: float = 0.0
-    max_new_tokens: int = 128
+    temperature: float = 1.0
+    num_beams: int = 1
     num_return_sequences: int = 10
-    max_new_tokens: int = 1024
+    max_new_tokens: int = 4096
 
 
-class CodeLlamaHFModels(PatchGenerationStrategy):
+class CodeLLaMAInfilling(PatchGenerationStrategy):
     __SUPPORTED_MODELS = {
-        "codellama/CodeLlama-7b-hf",
-        "codellama/CodeLlama-13b-hf",
-        "codellama/CodeLlama-7b-Instruct-hf",
-        "codellama/CodeLlama-13b-Instruct-hf",
+        "meta-llama/CodeLlama-7b-hf",
+        "meta-llama/CodeLlama-13b-hf",
+        "meta-llama/CodeLlama-34b-hf",
+        "meta-llama/CodeLlama-70b-hf",
     }
 
     __GENERATION_STRATEGIES = {
@@ -45,28 +44,34 @@ class CodeLlamaHFModels(PatchGenerationStrategy):
     def __init__(self, model_name: str, **kwargs) -> None:
         assert (
             model_name in self.__SUPPORTED_MODELS
-        ), f"Model {model_name} not supported by CodeLlamaHFModels"
+        ), f"Model {model_name} not supported by {self.__class__.__name__}"
         self.model_name = model_name
         self.__load_model()
         # Generation settings
         assert (
             kwargs.get("generation_strategy", "beam_search")
             in self.__GENERATION_STRATEGIES
-        ), f"Generation strategy {kwargs.get('generation_strategy', 'beam_search')} not supported by CodeLlamaHFModels"
+        ), f"Generation strategy {kwargs.get('generation_strategy', 'beam_search')} not supported by {self.__class__.__name__}"
         self.generate_settings = self.__GENERATION_STRATEGIES[
             kwargs.get("generation_strategy", "beam_search")
         ]
-        self.generate_settings.max_new_tokens = kwargs.get("max_new_tokens", 128)
+        self.generate_settings.max_new_tokens = kwargs.get(
+            "max_new_tokens", GenerateSettings.max_new_tokens
+        )
         self.generate_settings.num_return_sequences = kwargs.get(
-            "num_return_sequences", 10
+            "num_return_sequences", GenerateSettings.num_return_sequences
+        )
+        self.generate_settings.num_beams = kwargs.get(
+            "num_beams", GenerateSettings.num_beams
+        )
+        self.generate_settings.temperature = kwargs.get(
+            "temperature", GenerateSettings.temperature
         )
-        self.generate_settings.num_beams = kwargs.get("num_beams", 1)
-        self.generate_settings.temperature = kwargs.get("temperature", 0.2)
 
     def __load_model(self):
         # Setup environment
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        self.context_size = 2048
+        self.context_size = 16384
 
         # Setup kwargs
         kwargs = dict(
@@ -81,6 +86,7 @@ def __load_model(self):
         self.__MODEL = LlamaForCausalLM.from_pretrained(
             self.model_name, device_map="auto", **kwargs
         )
+        self.__MODEL.eval()
         self.__MODELS_LOADED = True
 
     def _generate_impl(self, prompt: str) -> Any:
diff --git a/elleelleaime/generate/strategies/models/huggingface/codellama/codellama_instruct.py b/elleelleaime/generate/strategies/models/huggingface/codellama/codellama_instruct.py
new file mode 100644
index 00000000..b85c0c76
--- /dev/null
+++ b/elleelleaime/generate/strategies/models/huggingface/codellama/codellama_instruct.py
@@ -0,0 +1,135 @@
+from elleelleaime.generate.strategies.strategy import PatchGenerationStrategy
+from dataclasses import dataclass
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from typing import Any
+
+import torch
+import threading
+import logging
+
+
+@dataclass
+class GenerateSettings:
+    name: str
+    do_sample: bool = False
+    temperature: float = 1.0
+    num_beams: int = 1
+    num_return_sequences: int = 10
+    max_new_tokens: int = 4096
+
+
+class CodeLLaMAIntruct(PatchGenerationStrategy):
+    __SUPPORTED_MODELS = {
+        "meta-llama/CodeLlama-7b-Instruct-hf",
+        "meta-llama/CodeLlama-13b-Instruct-hf",
+        "meta-llama/CodeLlama-34b-Instruct-hf",
+        "meta-llama/CodeLlama-70b-Instruct-hf",
+    }
+
+    __GENERATION_STRATEGIES = {
+        "beam_search": GenerateSettings(
+            name="beam_search",
+        ),
+        "sampling": GenerateSettings(
+            name="sampling",
+            do_sample=True,
+        ),
+    }
+
+    __MODEL = None
+    __TOKENIZER = None
+    __MODELS_LOADED: bool = False
+    __MODELS_LOCK: threading.Lock = threading.Lock()
+
+    def __init__(self, model_name: str, **kwargs) -> None:
+        assert (
+            model_name in self.__SUPPORTED_MODELS
+        ), f"Model {model_name} not supported by {self.__class__.__name__}"
+        self.model_name = model_name
+        self.__load_model(**kwargs)
+        # Generation settings
+        assert (
+            kwargs.get("generation_strategy", "sampling")
+            in self.__GENERATION_STRATEGIES
+        ), f"Generation strategy {kwargs.get('generation_strategy', 'sampling')} not supported by {self.__class__.__name__}"
+        self.generate_settings = self.__GENERATION_STRATEGIES[
+            kwargs.get("generation_strategy", "sampling")
+        ]
+        self.generate_settings.max_new_tokens = kwargs.get(
+            "max_new_tokens", GenerateSettings.max_new_tokens
+        )
+        self.generate_settings.num_return_sequences = kwargs.get(
+            "num_return_sequences", GenerateSettings.num_return_sequences
+        )
+        self.generate_settings.num_beams = kwargs.get(
+            "num_beams", GenerateSettings.num_beams
+        )
+        self.generate_settings.temperature = kwargs.get(
+            "temperature", GenerateSettings.temperature
+        )
+
+    def __load_model(self, **kwargs):
+        # Setup environment
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.context_size = 16384
+
+        # Setup model kwargs (keep the caller's kwargs for the adapter lookup below)
+        model_kwargs = dict(
+            torch_dtype=torch.bfloat16,
+        )
+
+        # Load the model and tokenizer
+        with self.__MODELS_LOCK:
+            if self.__MODELS_LOADED:
+                return
+
+            # Load tokenizer
+            self.__TOKENIZER = AutoTokenizer.from_pretrained(self.model_name)
+            self.__TOKENIZER.pad_token = self.__TOKENIZER.eos_token
+            # Load model
+            self.__MODEL = AutoModelForCausalLM.from_pretrained(
+                self.model_name, device_map="auto", **model_kwargs
+            )
+            # Load LoRA adapter (if requested)
+            if kwargs.get("adapter_name", None) is not None:
+                self.__MODEL = PeftModel.from_pretrained(
+                    self.__MODEL, kwargs.get("adapter_name")
+                )
+                self.__MODEL = self.__MODEL.merge_and_unload()
+            self.__MODEL.eval()
+            self.__MODELS_LOADED = True
+
+    def __format_prompt(self, prompt: str) -> str:
+        return f"[INST] {prompt} [/INST]"
+
+    def _generate_impl(self, prompt: str) -> Any:
+        formatted_prompt = self.__format_prompt(prompt)
+
+        input_ids = self.__TOKENIZER(formatted_prompt, return_tensors="pt")["input_ids"]
+        input_ids = input_ids.to(self.device)
+
+        max_length = self.generate_settings.max_new_tokens + input_ids.shape[1]
+        if max_length > self.context_size:
+            logging.warning(
+                "warning: max_length %s is greater than the context window %s"
+                % (max_length, self.context_size)
+            )
+            return None
+
+        with torch.no_grad():
+            generated_ids = self.__MODEL.generate(
+                input_ids,
+                max_new_tokens=self.generate_settings.max_new_tokens,
+                num_beams=self.generate_settings.num_beams,
+                num_return_sequences=self.generate_settings.num_return_sequences,
+                early_stopping=True,
+                do_sample=self.generate_settings.do_sample,
+                temperature=self.generate_settings.temperature,
+            )
+
+        input_len = input_ids.shape[1]
+        fillings_ids = generated_ids[:, input_len:]
+        fillings = self.__TOKENIZER.batch_decode(fillings_ids, skip_special_tokens=True)
+
+        return list(fillings)
diff --git a/elleelleaime/generate/strategies/registry.py b/elleelleaime/generate/strategies/registry.py
index ec3edf5c..e8910726 100644
--- a/elleelleaime/generate/strategies/registry.py
+++ b/elleelleaime/generate/strategies/registry.py
@@ -2,8 +2,11 @@
 from elleelleaime.generate.strategies.models.openai.openai import (
     OpenAIChatCompletionModels,
 )
-from elleelleaime.generate.strategies.models.huggingface.codellama import (
-    CodeLlamaHFModels,
+from elleelleaime.generate.strategies.models.huggingface.codellama.codellama_infilling import (
+    CodeLLaMAInfilling,
+)
+from elleelleaime.generate.strategies.models.huggingface.codellama.codellama_instruct import (
+    CodeLLaMAIntruct,
 )
 from typing import Tuple
 
@@ -21,8 +24,26 @@ class PatchGenerationStrategyRegistry:
         "gpt-3.5-turbo": (OpenAIChatCompletionModels, ("gpt-3.5-turbo",)),
         "gpt-4o-mini": (OpenAIChatCompletionModels, ("gpt-4o-mini",)),
         # HuggingFace models
-        "codellama-7b": (CodeLlamaHFModels, ("codellama/CodeLlama-7b-hf",)),
-        "codellama-13b": (CodeLlamaHFModels, ("codellama/CodeLlama-13b-hf",)),
+        "codellama-7b": (CodeLLaMAInfilling, ("meta-llama/CodeLlama-7b-hf",)),
+        "codellama-13b": (CodeLLaMAInfilling, ("meta-llama/CodeLlama-13b-hf",)),
+        "codellama-34b": (CodeLLaMAInfilling, ("meta-llama/CodeLlama-34b-hf",)),
+        "codellama-70b": (CodeLLaMAInfilling, ("meta-llama/CodeLlama-70b-hf",)),
+        "codellama-7b-instruct": (
+            CodeLLaMAIntruct,
+            ("meta-llama/CodeLlama-7b-Instruct-hf",),
+        ),
+        "codellama-13b-instruct": (
+            CodeLLaMAIntruct,
+            ("meta-llama/CodeLlama-13b-Instruct-hf",),
+        ),
+        "codellama-34b-instruct": (
+            CodeLLaMAIntruct,
+            ("meta-llama/CodeLlama-34b-Instruct-hf",),
+        ),
+        "codellama-70b-instruct": (
+            CodeLLaMAIntruct,
+            ("meta-llama/CodeLlama-70b-Instruct-hf",),
+        ),
     }
 
     @classmethod