add text-to-audio pipeline and dependencies #128

Draft · wants to merge 4 commits into main
3 changes: 2 additions & 1 deletion runner/Dockerfile
@@ -38,7 +38,8 @@ COPY ./requirements.txt /app
RUN pip install --no-cache-dir -r requirements.txt

RUN pip install https://github.com/chengzeyi/stable-fast/releases/download/v1.0.3/stable_fast-1.0.3+torch211cu121-cp311-cp311-manylinux2014_x86_64.whl

# Install parler-tts from GitHub; required by the text-to-speech pipeline
RUN pip install --no-cache-dir git+https://github.com/huggingface/parler-tts.git
# Most DL models are quite large in terms of memory, using workers is a HUGE
# slowdown because of the fork and GIL with python.
# Using multiple pods seems like a better default strategy.
7 changes: 7 additions & 0 deletions runner/app/main.py
@@ -52,6 +52,9 @@ def load_pipeline(pipeline: str, model_id: str) -> any:
from app.pipelines.upscale import UpscalePipeline

return UpscalePipeline(model_id)
case "text-to-speech":
from app.pipelines.text_to_speech import TextToSpeechPipeline
return TextToSpeechPipeline(model_id)
case _:
raise EnvironmentError(
f"{pipeline} is not a valid pipeline for model {model_id}"
@@ -82,6 +85,10 @@ def load_route(pipeline: str) -> any:
from app.routes import upscale

return upscale.router
case "text-to-speech":
from app.routes import text_to_speech

return text_to_speech.router
case _:
raise EnvironmentError(f"{pipeline} is not a valid pipeline")

72 changes: 72 additions & 0 deletions runner/app/pipelines/text_to_speech.py
@@ -0,0 +1,72 @@
import uuid
from app.pipelines.base import Pipeline
from app.pipelines.utils import get_torch_device
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import torch
import soundfile as sf
import os
import logging

logger = logging.getLogger(__name__)

class TextToSpeechPipeline(Pipeline):
def __init__(self, model_id: str):
self.model_id = model_id
if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
logger.info("Mocking TextToSpeechPipeline for %s", model_id)
return

self.device = get_torch_device()
# torch_dtype = torch.bfloat16
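        # NOTE: model_id is kept for interface parity but not used below; the Parler-TTS mini v1 checkpoint is hardcoded.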
        self.model = ParlerTTSForConditionalGeneration.from_pretrained(
            "parler-tts/parler-tts-mini-v1", attn_implementation="eager"
        ).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-v1")

# # compile the forward pass
# compile_mode = "default" # chose "reduce-overhead" for 3 to 4x speed-up
# self.model.generation_config.cache_implementation = "static"
# self.model.forward = torch.compile(self.model.forward, mode=compile_mode)

# # warmup
# inputs = self.tokenizer("This is for compilation", return_tensors="pt", padding="max_length", max_length=max_length).to(self.device)

# model_kwargs = {**inputs, "prompt_input_ids": inputs.input_ids, "prompt_attention_mask": inputs.attention_mask, }

# n_steps = 1 if compile_mode == "default" else 2
# for _ in range(n_steps):
# _ = self.model.generate(**model_kwargs)



    def __call__(self, text, description):
        unique_audio_filename = f"{uuid.uuid4()}.wav"
        audio_path = os.path.join("/tmp/", unique_audio_filename)

        if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
            sf.write(audio_path, [0] * 22050, samplerate=22050)
            return audio_path

        self.generate_audio(text, description, audio_path)

        return audio_path

    def generate_audio(
        self,
        text,
        description="",
        output_file_name="tmp.wav",
    ):
        # soundfile cannot encode MP4, so the default output is a WAV file;
        # an empty description falls back to the default voice below.
        if not description:
            description = (
                "A female speaker delivers a slightly expressive and animated "
                "speech with a moderate speed and pitch. The recording is of "
                "very high quality, with the speaker's voice sounding clear "
                "and very close up."
            )

input_ids = self.tokenizer(description, return_tensors="pt").input_ids.to(self.device)
prompt_input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(self.device)

generation = self.model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
sf.write(output_file_name, audio_arr, self.model.config.sampling_rate)
return output_file_name

def __str__(self) -> str:
return f"TextToSpeechPipeline model_id={self.model_id}"

1 change: 1 addition & 0 deletions runner/app/pipelines/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,3 +171,4 @@ def check_nsfw_images(
clip_input=safety_checker_input.pixel_values.to(self._dtype),
)
return images, has_nsfw_concept

70 changes: 70 additions & 0 deletions runner/app/routes/text_to_speech.py
@@ -0,0 +1,70 @@
from fastapi import APIRouter, Depends
from fastapi.responses import FileResponse, JSONResponse
from pydantic import BaseModel
from app.pipelines.base import Pipeline
from app.routes.util import AudioResponse
from app.dependencies import get_pipeline
import logging
import os

class HTTPError(BaseModel):
detail: str

router = APIRouter()

logger = logging.getLogger(__name__)

responses = {
400: {"content": {"application/json": {"schema": HTTPError.schema()}}},
500: {"content": {"application/json": {"schema": HTTPError.schema()}}},
200: {
"content": {
"audio/mp4": {},
}
}
}

class TextToSpeechParams(BaseModel):
    text_input: str = ""
    description: str = ""
    model_id: str = ""


@router.post("/text-to-speech",
response_model=AudioResponse,
responses=responses)
async def text_to_speech(
params: TextToSpeechParams,
pipeline: Pipeline = Depends(get_pipeline),
):

try:
if not params.text_input:
raise ValueError("text_input is required and cannot be empty.")

result = pipeline(params.text_input, params.description)

except ValueError as ve:
logger.error(f"Validation error: {ve}")
return JSONResponse(
status_code=400,
content={"detail": str(ve)},
)

except Exception as e:
logger.error(f"TextToSpeechPipeline error: {e}")
return JSONResponse(
status_code=500,
content={"detail": f"Internal Server Error: {str(e)}"},
)

    if os.path.exists(result):
        return FileResponse(path=result, media_type="audio/wav", filename="generated_audio.wav")
else:
return JSONResponse(
status_code=400,
content={
"detail": f"no output found for {result}"
},
)
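
A hypothetical client call for this route, assuming a runner listening on localhost:8000 and that FastAPI reads TextToSpeechParams from the JSON request body:

import requests

resp = requests.post(
    "http://localhost:8000/text-to-speech",
    json={
        "text_input": "Hello from the runner.",
        "description": "",  # empty string selects the default voice description
    },
    timeout=300,  # generation can take a while on first call
)
resp.raise_for_status()

# On success the route streams the WAV file back as the response body.
with open("generated_audio.wav", "wb") as f:
    f.write(resp.content)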
12 changes: 12 additions & 0 deletions runner/app/routes/util.py
@@ -23,6 +23,18 @@ class ImageResponse(BaseModel):
class VideoResponse(BaseModel):
frames: List[List[Media]]

class AudioResponse(BaseModel):
audio: Media

class chunk(BaseModel):
timestamp: tuple
4 changes: 2 additions & 2 deletions runner/dev/Dockerfile.debug
@@ -5,7 +5,7 @@ FROM livepeer/ai-runner:base
RUN pip install debugpy

# Expose the debugpy port and start the app as usual.
CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
# CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

# If you want to wait for the debugger to attach before starting the app, use the --wait-for-client option.
# CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["python", "-m", "debugpy", "--listen", "0.0.0.0:5678", "--wait-for-client", "-m", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
75 changes: 51 additions & 24 deletions runner/dev/patches/debug.patch
@@ -1,25 +1,52 @@
diff --git a/worker/docker.go b/worker/docker.go
index e7dcca1..7ad026a 100644
--- a/worker/docker.go
+++ b/worker/docker.go
@@ -148,6 +148,7 @@ func (m *DockerManager) createContainer(ctx context.Context, pipeline string, mo
},
ExposedPorts: nat.PortSet{
containerPort: struct{}{},
+ "5678/tcp": struct{}{},
},
Labels: map[string]string{
containerCreatorLabel: containerCreator,
@@ -176,6 +177,12 @@ func (m *DockerManager) createContainer(ctx context.Context, pipeline string, mo
HostPort: containerHostPort,
},
},
+ "5678/tcp": []nat.PortBinding{
+ {
+ HostIP: "0.0.0.0",
+ HostPort: "5678",
+ },
+ },
},
}
--- app/pipelines/text_to_speech.py 2024-08-02 20:39:18.658448901 +0000
+++ app/pipelines/text_to_speech_updated.py 2024-08-02 20:39:02.304028206 +0000
@@ -12,21 +12,21 @@
class TextToSpeechPipeline(Pipeline):
def __init__(self, model_id: str):
self.model_id = model_id
- # kwargs = {"cache_dir": get_model_dir()}
+ if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
+ logger.info("Mocking TextToSpeechPipeline for %s", model_id)
+ return

- # folder_name = file_download.repo_folder_name(
- # repo_id=model_id, repo_type="model"
- # )
- # folder_path = os.path.join(get_model_dir(), folder_name)
self.device = get_torch_device()
- # preload FastSpeech 2 & hifigan
self.TTS_tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer", cache_dir=get_model_dir())
self.TTS_model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer", cache_dir=get_model_dir()).to(self.device)
self.TTS_hifigan = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan", cache_dir=get_model_dir()).to(self.device)

-
def __call__(self, text):
- # generate unique filename
+ if os.getenv("MOCK_PIPELINE", "").strip().lower() == "true":
+ unique_audio_filename = f"{uuid.uuid4()}.wav"
+ audio_path = os.path.join("/tmp/", unique_audio_filename)
+ sf.write(audio_path, [0] * 22050, samplerate=22050)
+ return audio_path
unique_audio_filename = f"{uuid.uuid4()}.wav"
audio_path = os.path.join("/tmp/", unique_audio_filename)

@@ -35,19 +35,11 @@
return audio_path

def generate_audio(self, text, output_file_name):
- # Tokenize input text
inputs = self.TTS_tokenizer(text, return_tensors="pt").to(self.device)
-
- # Ensure input IDs remain in Long tensor type
input_ids = inputs["input_ids"].to(self.device)
-
- # Generate spectrogram
output_dict = self.TTS_model(input_ids, return_dict=True)
spectrogram = output_dict["spectrogram"]
-
- # Convert spectrogram to waveform
waveform = self.TTS_hifigan(spectrogram)
-
sf.write(output_file_name, waveform.squeeze().detach().cpu().numpy(), samplerate=22050)
return output_file_name

7 changes: 7 additions & 0 deletions runner/dl_checkpoints.sh
@@ -30,6 +30,11 @@ function download_alpha_models() {

# Download upscale models
huggingface-cli download stabilityai/stable-diffusion-x4-upscaler --include "*.fp16.safetensors" --cache-dir models


# Download FastSpeech 2 and HiFi-GAN models
huggingface-cli download facebook/fastspeech2-en-ljspeech --include "*.bin" "*.json" --cache-dir models/fastspeech2
huggingface-cli download facebook/hifigan --include "*.bin" "*.json" --cache-dir models/hifigan

# Download audio-to-text models.
huggingface-cli download openai/whisper-large-v3 --include "*.safetensors" "*.json" --cache-dir models
@@ -39,6 +44,8 @@ function download_alpha_models() {
# Download image-to-video models (token-gated).
check_hf_auth
huggingface-cli download stabilityai/stable-video-diffusion-img2vid-xt-1-1 --include "*.fp16.safetensors" "*.json" --cache-dir models ${TOKEN_FLAG:+"$TOKEN_FLAG"}


}

# Download all models.
12 changes: 10 additions & 2 deletions runner/gen_openapi.py
@@ -5,8 +5,15 @@

import yaml
from app.main import app, use_route_names_as_operation_ids
from app.routes import (audio_to_text, health, image_to_image, image_to_video,
text_to_image, upscale)
from app.routes import (
    audio_to_text,
    health,
    image_to_image,
    image_to_video,
    text_to_image,
    text_to_speech,
    upscale,
)
from fastapi.openapi.utils import get_openapi

# Specify Endpoints for OpenAPI schema generation.
@@ -79,6 +86,7 @@ def write_openapi(fname, entrypoint="runner"):
app.include_router(image_to_video.router)
app.include_router(upscale.router)
app.include_router(audio_to_text.router)
app.include_router(text_to_speech.router)

use_route_names_as_operation_ids(app)
