Go语言 热门 如何使用coqui-ai TTS将文本文件转换为音频?

oipij1gg  于 2023-04-18  发布在  Go
关注(0)|答案(1)|浏览(430)

我想使用coqui-ai TTS,但它在处理短句时有问题,所以我写了一个Go脚本,只保留转换时不会报错的句子,然后合并所有音频。
脚本在大部分情况下都运行良好,但遇到短句时,它会发出奇怪的声音,而且生成的音频通常比应有的长度更长。
我认为最好的方法是计算出每个字符的平均音频长度,如果一个句子的音频长度超过预期长度一定的幅度,就把多余的部分截断,我在go中怎么做呢?
这是代码:

package main

import (
    "log"
    "os"
    "os/exec"
    "strconv"

    "github.com/cheggaaa/pb/v3"
    "github.com/neurosnap/sentences/english"
)

func main() {
    if len(os.Args) != 2 {
        log.Fatalf("Usage: go run main.go <input>")
    }
    sentences := get_sentences()
    audio_files := convert_text_to_audio(sentences)
    concatenate_audio_files(audio_files)
}

// get_sentences reads the file named by the first command-line argument and
// splits its contents into sentences with the English sentence tokenizer.
//
// It returns the text of every sentence in the order the tokenizer produced
// them. Any failure (creating the tokenizer or reading the file) is logged and
// terminates the program, the same way the rest of this program reports fatal
// errors.
func get_sentences() []string {
    tokenizer, err := english.NewSentenceTokenizer(nil)
    if err != nil {
        log.Fatalf("creating sentence tokenizer: %v", err)
    }
    text, err := os.ReadFile(os.Args[1])
    if err != nil {
        log.Fatal(err)
    }
    tokens := tokenizer.Tokenize(string(text))
    // The final length is known up front, so allocate once.
    sentences := make([]string, 0, len(tokens))
    for _, token := range tokens {
        sentences = append(sentences, token.Text)
    }
    return sentences
}

// convert_text_to_audio synthesizes each sentence into its own WAV file by
// running the external `tts` command once per sentence, showing a progress
// bar while it works.
//
// Output files are named out_<index>.wav, where index is the sentence's
// position in sentences. A sentence whose conversion fails is logged (with the
// command line and the underlying error) and skipped instead of aborting the
// run, so the returned slice lists only the files for which the command
// succeeded, in input order.
func convert_text_to_audio(sentences []string) []string {
    audioFiles := make([]string, 0, len(sentences))
    bar := pb.StartNew(len(sentences))
    for i, sentence := range sentences {
        audioFile := "out_" + strconv.Itoa(i) + ".wav"
        cmd := exec.Command("tts", "--text", sentence, "--model_name", "tts_models/en/ljspeech/tacotron2-DDC", "--out_path", audioFile)
        if err := cmd.Run(); err != nil {
            // Keep going: one bad sentence should not discard the rest, but
            // record why it failed so the gap in the output can be diagnosed.
            log.Println(cmd.String())
            log.Printf("Failed to run coqui-ai tts: %v", err)
        } else {
            audioFiles = append(audioFiles, audioFile)
        }
        bar.Increment()
    }
    bar.Finish()
    return audioFiles
}

// concatenate_audio_files joins the given audio files, in order, into a single
// file named out.wav by running the external `sox` command.
//
// The program exits with a logged error when there is nothing to join or when
// sox fails.
func concatenate_audio_files(audio_files []string) {
    if len(audio_files) == 0 {
        // Without inputs sox would receive only the output file name, so
        // report the real problem instead of a generic sox failure.
        log.Fatal("no audio files to concatenate")
    }
    // Build the argument list in a fresh slice: appending to audio_files
    // directly could write into the caller's backing array.
    args := make([]string, 0, len(audio_files)+1)
    args = append(args, audio_files...)
    args = append(args, "out.wav")
    if err := exec.Command("sox", args...).Run(); err != nil {
        log.Fatalf("Failed to run sox: %v", err)
    }
}
2lpgd968

2lpgd9681#

您可以使用以下代码保存wav文件并播放它:

# Name of the TTS model to load.
model_name = "tts_models/en/vctk/vits"

# Init TTS
# NOTE(review): this snippet does not show its imports; TTS is presumably
# `from TTS.api import TTS` (coqui-ai) -- confirm before running.
tts = TTS(model_name)

def play_wav():
    """Play ``output.wav`` slightly faster than it was recorded.

    Reads ``output.wav``, wraps its raw frames in an ``AudioSegment``,
    speeds the audio up by a factor of 1.05 and plays the result.
    """
    # Open the input WAV file
    with wave.open("output.wav", "rb") as f:
        # Collect every stream parameter while the file is still open
        sample_rate = f.getframerate()
        num_channels = f.getnchannels()
        sample_width = f.getsampwidth()
        # Read the entire waveform into memory
        frames = f.readframes(f.getnframes())

    # Convert the waveform to an AudioSegment object
    audio = AudioSegment(
        data=frames,
        sample_width=sample_width,
        frame_rate=sample_rate,
        channels=num_channels
    )

    # Increase the speed by a factor of X
    new_audio = audio.speedup(playback_speed=1.05)

    # Play the modified audio
    play(new_audio)


# Synthesize the text to output.wav, then play it back. These statements sit
# at module level: nested inside play_wav() they would never run on their own
# and would make the function call itself without end.
tts.tts_to_file(text="Hi this a test", speaker="p243", file_path="output.wav")
play_wav()

相关问题