add text-to-audio pipeline and dependencies #128

Draft · wants to merge 4 commits into main
3 changes: 2 additions & 1 deletion runner/Dockerfile
@@ -38,7 +38,8 @@ COPY ./requirements.txt /app
RUN pip install --no-cache-dir -r requirements.txt

RUN pip install https://github.com/chengzeyi/stable-fast/releases/download/v1.0.3/stable_fast-1.0.3+torch211cu121-cp311-cp311-manylinux2014_x86_64.whl

# Install parler-tts from GitHub; required by the text-to-speech pipeline
RUN pip install --no-cache-dir git+https://github.com/huggingface/parler-tts.git
# Most DL models are quite large in terms of memory, using workers is a HUGE
# slowdown because of the fork and GIL with python.
# Using multiple pods seems like a better default strategy.
7 changes: 7 additions & 0 deletions runner/app/main.py
@@ -52,6 +52,9 @@ def load_pipeline(pipeline: str, model_id: str) -> any:
from app.pipelines.upscale import UpscalePipeline

return UpscalePipeline(model_id)
case "text-to-speech":
from app.pipelines.text_to_speech import TextToSpeechPipeline
return TextToSpeechPipeline(model_id)
case _:
raise EnvironmentError(
f"{pipeline} is not a valid pipeline for model {model_id}"
@@ -82,6 +85,10 @@ def load_route(pipeline: str) -> any:
from app.routes import upscale

return upscale.router
case "text-to-speech":
from app.routes import text_to_speech

return text_to_speech.router
case _:
raise EnvironmentError(f"{pipeline} is not a valid pipeline")

72 changes: 72 additions & 0 deletions runner/app/pipelines/text_to_speech.py
@@ -0,0 +1,72 @@
import uuid
from app.pipelines.base import Pipeline
from app.pipelines.utils import get_torch_device
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import torch
import soundfile as sf
import os
import logging

logger = logging.getLogger(__name__)

class TextToSpeechPipeline(Pipeline):
def __init__(self, model_id: str):
self.model_id = model_id
if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
logger.info("Mocking TextToSpeechPipeline for %s", model_id)
return

self.device = get_torch_device()
# torch_dtype = torch.bfloat16
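        # NOTE: model_id is kept for interface parity but not used below; the Parler-TTS mini v1 checkpoint is hardcoded.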
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(
            "parler-tts/parler-tts-mini-v1", attn_implementation="eager"
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

# # compile the forward pass
# compile_mode = "default" # chose "reduce-overhead" for 3 to 4x speed-up
# self.model.generation_config.cache_implementation = "static"
# self.model.forward = torch.compile(self.model.forward, mode=compile_mode)

# # warmup
# inputs = self.tokenizer("This is for compilation", return_tensors="pt", padding="max_length", max_length=max_length).to(self.device)

# model_kwargs = {**inputs, "prompt_input_ids": inputs.input_ids, "prompt_attention_mask": inputs.attention_mask, }

# n_steps = 1 if compile_mode == "default" else 2
# for _ in range(n_steps):
# _ = self.model.generate(**model_kwargs)



    def __call__(self, text, description):
        unique_audio_filename = f"{uuid.uuid4()}.wav"
        audio_path = os.path.join("/tmp/", unique_audio_filename)

        if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
            sf.write(audio_path, [0] * 22050, samplerate=22050)
            return audio_path

        self.generate_audio(text, description, audio_path)

        return audio_path

    def generate_audio(
        self,
        text,
        description="",
        output_file_name="tmp.wav",
    ):
        # soundfile cannot encode MP4, so the default output is a WAV file;
        # an empty description falls back to the default voice below.
        if not description:
            description = (
                "A female speaker delivers a slightly expressive and animated "
                "speech with a moderate speed and pitch. The recording is of "
                "very high quality, with the speaker's voice sounding clear "
                "and very close up."
            )

input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.device)
prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)

generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
sf.write(output_file_name, audio_arr, self.model.config.sampling_rate)
return output_file_name

def __str__(self) -> str:
return f"TextToSpeechPipeline model_id={self.model_id}"

1 change: 1 addition & 0 deletions runner/app/pipelines/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,4 @@ def check_nsfw_images(
clip_input=safety_checker_input.pixel_values.to(self._dtype),
)
return images, has_nsfw_concept

70 changes: 70 additions & 0 deletions runner/app/routes/text_to_speech.py
@@ -0,0 +1,70 @@
from fastapi import APIRouter, Depends
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel
from app.pipelines.base import Pipeline
from app.routes.util import AudioResponse
from app.dependencies import get_pipeline
import logging
import os

class HTTPError(BaseModel):
detail: str

router = APIRouter()

logger = logging.getLogger(__name__)

responses = {
400: {"content": {"application/json": {"schema": HTTPError.schema()}}},
500: {"content": {"application/json": {"schema": HTTPError.schema()}}},
200: {
"content": {
"audio/mp4": {},
}
}
}

class TextToSpeechParams(BaseModel):
    text_input: str = ""
    description: str = ""
    model_id: str = ""


@router.post("/text-to-speech",
response_model=AudioResponse,
responses=responses)
async def text_to_speech(
params: TextToSpeechParams,
pipeline: Pipeline = Depends(get_pipeline),
):

try:
if not params.text_input:
raise ValueError("text_input is required and cannot be empty.")

result = pipeline(params.text_input, params.description)

except ValueError as ve:
logger.error(f"Validation error: {ve}")
return JSONResponse(
status_code=400,
content={"detail": str(ve)},
)

except Exception as e:
logger.error(f"TextToSpeechPipeline error: {e}")
return JSONResponse(
status_code=500,
content={"detail": f"Internal Server Error: {str(e)}"},
)

    if os.path.exists(result):
        return FileResponse(path=result, media_type="audio/wav", filename="generated_audio.wav")
else:
return JSONResponse(
status_code=400,
content={
"detail": f"no output found for {result}"
},
)
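
A hypothetical client call for this route, assuming a runner listening on localhost:8000 and that FastAPI reads TextToSpeechParams from the JSON request body:

import requests

resp = requests.post(
    "http://localhost:8000/text-to-speech",
    json={
        "text_input": "Hello from the runner.",
        "description": "",  # empty string selects the default voice description
    },
    timeout=300,  # generation can take a while on first call
)
resp.raise_for_status()

# On success the route streams the WAV file back as the response body.
with open("generated_audio.wav", "wb") as f:
    f.write(resp.content)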
12 changes: 12 additions & 0 deletions runner/app/routes/util.py
@@ -23,6 +23,18 @@ class ImageResponse(BaseModel):
class VideoResponse(BaseModel):
frames: List[List[Media]]

class AudioResponse(BaseModel):
audio: Media

class chunk(BaseModel):
timestamp: tuple
4 changes: 2 additions & 2 deletions runner/dev/Dockerfile.debug
@@ -5,7 +5,7 @@ FROM livepeer/ai-runner:base
RUN pip install debugpy

# Expose the debugpy port and start the app as usual.
CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
# CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

# If you want to wait for the debugger to attach before starting the app, use the --wait-for-client option.
# CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
75 changes: 51 additions & 24 deletions runner/dev/patches/debug.patch
@@ -1,25 +1,52 @@
diff --git a/worker/docker.go b/worker/docker.go
index e7dcca1..7ad026a 100644
--- a/worker/docker.go
+++ b/worker/docker.go
@@ -148,6 +148,7 @@ func (m *DockerManager) createContainer(ctx context.Context, pipeline string, mo
},
ExposedPorts: nat.PortSet{
containerPort: struct{}{},
+ "5678/tcp": struct{}{},
},
Labels: map[string]string{
containerCreatorLabel: containerCreator,
@@ -176,6 +177,12 @@ func (m *DockerManager) createContainer(ctx context.Context, pipeline string, mo
HostPort: containerHostPort,
},
},
+ "5678/tcp": []nat.PortBinding{
+ {
+ HostIP: "0.0.0.0",
+ HostPort: "5678",
+ },
+ },
},
}
--- app/pipelines/text_to_speech.py 2024-08-02 20:39:18.658448901 +0000
+++ app/pipelines/text_to_speech_updated.py 2024-08-02 20:39:02.304028206 +0000
@@ -12,21 +12,21 @@
class TextToSpeechPipeline(Pipeline):
def __init__(self, model_id: str):
self.model_id = model_id
- # kwargs = {"cache_dir": get_model_dir()}
+ if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
+ logger.info("Mocking TextToSpeechPipeline for %s", model_id)
+ return

- # folder_name = file_download.repo_folder_name(
- # repo_id=model_id, repo_type="model"
- # )
- # folder_path = os.path.join(get_model_dir(), folder_name)
self.device = get_torch_device()
- # preload FastSpeech 2 & hifigan
self.TTS_tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer", cache_dir=get_model_dir())
self.TTS_model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer", cache_dir=get_model_dir()).to(self.device)
self.TTS_hifigan = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan", cache_dir=get_model_dir()).to(self.device)

-
def __call__(self, text):
- # generate unique filename
+ if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
+ unique_audio_filename = f"{uuid.uuid4()}.wav"
+ audio_path = os.path.join("/tmp/", unique_audio_filename)
+ sf.write(audio_path, [0] * 22050, samplerate=22050)
+ return audio_path
unique_audio_filename = f"{uuid.uuid4()}.wav"
audio_path = os.path.join("/tmp/", unique_audio_filename)

@@ -35,19 +35,11 @@
return audio_path

def generate_audio(self, text, output_file_name):
- # Tokenize input text
inputs = self.TTS_tokenizer(text, return_tensors="pt").to(self.device)
-
- # Ensure input IDs remain in Long tensor type
input_ids = inputs["input_ids"].to(self.device)
-
- # Generate spectrogram
output_dict = self.TTS_model(input_ids, return_dict=True)
spectrogram = output_dict["spectrogram"]
-
- # Convert spectrogram to waveform
waveform = self.TTS_hifigan(spectrogram)
-
sf.write(output_file_name, waveform.squeeze().detach().cpu().numpy(), samplerate=22050)
return output_file_name

7 changes: 7 additions & 0 deletions runner/dl_checkpoints.sh
@@ -30,6 +30,11 @@ function download_alpha_models() {

# Download upscale models
huggingface-cli download stabilityai/stable-diffusion-x4-upscaler --include "*.fp16.safetensors" --cache-dir models


# Download FastSpeech 2 and HiFi-GAN models
huggingface-cli download facebook/fastspeech2-en-ljspeech --include "*.bin" "*.json" --cache-dir models/fastspeech2
huggingface-cli download facebook/hifigan --include "*.bin" "*.json" --cache-dir models/hifigan

# Download audio-to-text models.
huggingface-cli download openai/whisper-large-v3 --include "*.safetensors" "*.json" --cache-dir models
@@ -39,6 +44,8 @@ function download_alpha_models() {
# Download image-to-video models (token-gated).
check_hf_auth
huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt-1-1 --include "*.fp16.safetensors" "*.json" --cache-dir models ${TOKEN_FLAG:+"$TOKEN_FLAG"}


}

# Download all models.
12 changes: 10 additions & 2 deletions runner/gen_openapi.py
@@ -5,8 +5,15 @@

import yaml
from app.main import app, use_route_names_as_operation_ids
from app.routes import (audio_to_text, health, image_to_image, image_to_video,
text_to_image, upscale)
from app.routes import (
    audio_to_text,
    health,
    image_to_image,
    image_to_video,
    text_to_image,
    text_to_speech,
    upscale,
)
from fastapi.openapi.utils import get_openapi

# Specify Endpoints for OpenAPI schema generation.
@@ -79,6 +86,7 @@ def write_openapi(fname, entrypoint="runner"):
app.include_router(image_to_video.router)
app.include_router(upscale.router)
app.include_router(audio_to_text.router)
app.include_router(text_to_speech.router)

use_route_names_as_operation_ids(app)
