NodeJS 如何从实时更新的OpenAI数据流生成文本到语音

jobtbby3  于 2023-08-04  发布在  Node.js
关注(0)|答案(1)|浏览(223)

这个问题比较复杂,希望我能把它解释清楚。
我正在使用OpenAI聊天API尝试制作一个虚拟助手。我的做法是:提交一个查询并交给API处理,响应在生成的同时以数据块的形式流式返回给客户端,以缩短用户感知到的响应时间。
我现在想做的是,在OpenAI数据流生成的同时,使用ElevenLabs语音API把文本转换成语音。我不知道ElevenLabs(或其他任何TTS服务)能否做到这一点,如果有人能帮助我,我将非常感激。

服务器端

// Dependencies
const express = require("express");
const app = express();
const cors = require("cors");
const server = require("http").Server(app);
const { Configuration, OpenAIApi } = require("openai");
const { OpenAIExt } = require("openai-ext");
const voice = require('elevenlabs-node');
const fs = require('fs');

// Application settings: render views with EJS and pretty-print JSON responses
// with two spaces of indentation.
app.set("view engine", "ejs").set("json spaces", 2);

// Middleware, registered in this order: static files from ./public, CORS,
// then the URL-encoded and JSON request-body parsers.
for (const middleware of [
    express.static("public"),
    cors(),
    express.urlencoded({ extended: false }),
    express.json(),
]) {
    app.use(middleware);
}

// OpenAI Config
// The key is taken from the OPENAI_API_KEY environment variable so the secret
// does not have to live in the source file; when the variable is unset the
// original empty placeholder is used.
const configuration = new Configuration({
    apiKey: process.env.OPENAI_API_KEY || "",
});
const openai = new OpenAIApi(configuration);

// Set up Elevenlabs voice API
// The key is taken from the ELEVENLABS_API_KEY environment variable (empty when unset).
const apiKey = process.env.ELEVENLABS_API_KEY || "";    // Your API key from Elevenlabs
const voiceID = 'pNInz6obpgDQGcFmaJgB';                 // The ID of the voice you want to get
const fileName = 'public/speech.mp3';                   // The name of your audio file (inside the static "public" folder)

// Shared stream configuration handed to OpenAIExt.streamServerChatCompletion
// (TypeScript users can type it as ServerStreamChatCompletionConfig).
// These are the default callbacks; the /updates route replaces them.
const streamConfig = {
    openai,

    handler: {
        // `content` is the string draft, which may be partial; `isFinal` is true once the completion is done.
        onContent: (content, isFinal, stream) => {
            console.log(content, "isFinal?", isFinal);
        },
        // The completion has finished: release the underlying stream.
        onDone: (stream) => {
            stream.destroy();
        },
        // Log any error reported for the stream.
        onError: (error, stream) => {
            console.error(error);
        },
    },
};

// Set up SSE route for updates
//
// GET /updates opens a Server-Sent Events stream and installs the handlers on
// the shared `streamConfig`, so completion drafts produced by POST /openai are
// forwarded to this response as `update` events. The handlers are module-wide:
// each new connection overwrites the handlers of the previous one.
app.get("/updates", (req, res) => {
    res.setHeader("Content-Type", "text/event-stream");
    res.setHeader("Cache-Control", "no-cache");
    res.setHeader("Connection", "keep-alive");

    // Send a comment to indicate that the connection was successful
    res.write(": connected\n\n");

    // The headers and the first chunk are already sent at this point, so the
    // status code can no longer be changed; on failure the only thing left to
    // do is to end the stream, and only once.
    const closeStream = () => {
        if (!res.writableEnded) {
            res.end();
        }
    };

    // Forward every draft to the client; on the final draft also request the speech audio.
    streamConfig.handler.onContent = (content, isFinal, stream) => {
        try {
            // Skip the write once this response has been ended (e.g. by a previous onDone).
            if (!res.writableEnded) {
                const data = JSON.stringify({ content, isFinal });

                // Send the update to the client as an SSE event
                res.write(`event: update\ndata: ${data}\n\n`);
            }

            if (isFinal) {
                // Text-to-speech is only requested for the finished text, so the
                // audio file is written after the whole completion has streamed.
                voice.textToSpeech(apiKey, voiceID, fileName, content)
                    .then((result) => {
                        console.log(result);
                    })
                    .catch((error) => {
                        console.error("Error generating speech:", error);
                    });
            }
        } catch (error) {
            console.error("Error sending update:", error);
            closeStream();
        }
    };

    // The completion is done: release the OpenAI stream and close the SSE response.
    streamConfig.handler.onDone = (stream) => {
        stream.destroy();
        closeStream();
    };

    // Handle any errors that might occur while setting up the stream
    streamConfig.handler.onError = (error) => {
        console.error("Error setting up stream:", error);
        closeStream();
    };
});

// POST /openai starts a streamed chat completion for the conversation in
// `req.body.messages`. The completion text itself is delivered through the
// handlers in `streamConfig`; this route only reports whether the request
// could be started.
//
// Responses:
//   200 { message: "Request successful" }
//   400 when `messages` is missing, not an array, or empty
//   500 when the call to start the completion is rejected
app.post("/openai", async (req, res) => {
    const { messages } = req.body || {};

    if (!Array.isArray(messages) || messages.length === 0) {
        res.status(400).json({ message: "Request body must contain a non-empty 'messages' array" });
        return;
    }

    try {
        // Make the call to stream the completion
        await OpenAIExt.streamServerChatCompletion(
            {
                model: "gpt-3.5-turbo",
                messages: messages,
                max_tokens: 1024,
                temperature: 1,
                top_p: 1,
                frequency_penalty: 0.0,
                presence_penalty: 0.6,
            },
            streamConfig
        );

        // Send a success message back to the client
        res.json({ message: "Request successful" });
    } catch (error) {
        // Without this catch a rejected call would leave the client without any response.
        console.error("Error starting chat completion:", error);
        res.status(500).json({ message: "Failed to start the chat completion" });
    }
});

// Page routes: each path renders the view with the given name.
for (const [path, view] of [["/", "index"], ["/test", "index copy"]]) {
    app.get(path, (req, res) => {
        res.render(view);
    });
}

// Start the HTTP server on port 3000.
server.listen(3000);

字符串

客户端

// script.js

// References
// DOM elements used throughout the script: the query input and the container
// that conversation entries are appended to.
const queryBox = document.getElementById("query-box");
const mainContent = document.getElementById("main-content");

// Speech Recognition
// Use the standard constructor when present, otherwise the webkit-prefixed one.
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
let recognition;            // Current recognition instance, assigned in speechToText()
let recording = false;      // Set to true once a wake keyword has been heard
let speaking = false;       // True while assistant updates are being received

// Transcriptions that are treated as the assistant's name "Skyla".
const skylaKeywords = ["Skylar", "Skyler", "scholar"];

// Conversation history sent to /openai with every request, seeded with the
// system prompt and a short example exchange.
const messages = [
    { role: "system", content: "You are a virtual assistant named Skyla that is designed to speak like J.A.R.V.I.S from the Iron Man Movies, and respond as such by including a bit of his sass in responses. You can refer to me as Sir if you so wish." },
    { role: "user", content: "Skyla can you speak more like Jarvis" },
    { role: "assistant", content: "Of course, Sir. Is there a specific phrase or tone you would like me to emulate? Just let me know and I'll do my best to channel my inner J.A.R.V.I.S for you." },
    { role: "user", content: "I want you to speak like Jarvis from the first Iron Man movie, incorporating just a bit more of his Sass in responses" },
];

// Counter of completed exchanges; also used as the DOM id of the response element being filled.
let questionNumber = 0;

// Begin listening for speech as soon as the page has loaded.
window.addEventListener("load", () => {
    speechToText();
});

// Releasing Enter (without Shift) in the query box clicks the submit button.
queryBox.addEventListener("keyup", (event) => {
    const isPlainEnter = event.key === "Enter" && !event.shiftKey;
    if (!isPlainEnter) {
        return;
    }

    event.preventDefault();
    document.getElementById("submit-query").click();
});

/**
 * Sends the text currently in the query box to the assistant, then resets the
 * box to its default height and clears it.
 */
function submitQuery() {
    const { value: query, style } = queryBox;

    fetchResponse(query);
    style.height = "55px";
    queryBox.value = "";
}

// Subscribe to the server's Server-Sent Events stream of completion updates.
const source = new EventSource("/updates");

source.addEventListener("open", () => {
    console.log("Connection to updates endpoint opened");
});

// Each `update` event carries the draft of the assistant's reply (`content`)
// and a flag telling whether the completion has finished (`isFinal`).
source.addEventListener("update", (event) => {
    const { content, isFinal } = JSON.parse(event.data);
    // Response element for the current exchange; it is created by fetchResponse().
    const responseBox = document.getElementById(questionNumber);

    speaking = true;
    mainContent.scrollTop = mainContent.scrollHeight;

    // Update the element with the new content
    if (responseBox != null) {
        // Build the nodes explicitly and set the reply through textContent so
        // that markup-like characters in the model output are shown as text
        // instead of being parsed as HTML.
        const avatar = document.createElement("img");
        avatar.src = "icons/skyla.png";

        const paragraph = document.createElement("p");
        paragraph.textContent = content;

        const wrapper = document.createElement("div");
        wrapper.append(paragraph);

        responseBox.replaceChildren(avatar, wrapper);
    }

    if (isFinal) {
        console.log("Completion finished");

        // Do the conversation bookkeeping first so that an audio failure can
        // never leave `messages`, `questionNumber` or `speaking` out of date.
        messages.push({ role: "assistant", content: content });
        questionNumber += 1;
        speaking = false;

        // The server writes the speech to public/speech.mp3, which its static
        // middleware serves as "speech.mp3". The timestamp query avoids
        // replaying a cached copy, since the same file name is reused.
        // NOTE(review): the server only starts generating the file when it
        // sends this final update, so the file may still be the previous
        // reply's audio at this moment — confirm and synchronise if needed.
        const audio = new Audio(`speech.mp3?t=${Date.now()}`);
        audio.play().catch((error) => {
            console.error("Could not play speech audio:", error);
        });
    }
});

// Convert speech to text
/**
 * Starts a new speech-recognition session and wires up its handlers.
 *
 * - A transcript that is exactly one of `skylaKeywords` (while not already
 *   recording) switches recording on and marks the query box as recording.
 * - A final transcript received while recording, and while the assistant is
 *   not speaking, is sent to fetchResponse() with every keyword replaced by
 *   "Skyla".
 * - When speech ends, or a "no-speech" error occurs, a fresh session is started.
 *
 * Any exception thrown while setting up recognition is logged and recording
 * is switched off.
 */
function speechToText() {
    try {
        // Initialise Speech Recognition
        recognition = new SpeechRecognition();
        recognition.lang = "en";
        recognition.interimResults = true;

        // Start Recognition
        recognition.start();
        recognition.onresult = (event) => {
            const result = event.results[0];
            let speech = result[0].transcript;

            // A transcript consisting solely of 'Skylar, Skyler or Scholar' is the wake word.
            if (!recording && skylaKeywords.includes(speech)) {
                speech = "Skyla";
                queryBox.classList.add("recording");
                recording = true;
            }

            console.log(speech);

            // Detect the final speech result.
            if (result.isFinal && recording && !speaking) {
                // Apply the replacements cumulatively. Restarting from the
                // original transcript for each keyword would keep only the
                // replacement of the last keyword that matched.
                const newSpeech = skylaKeywords.reduce(
                    (text, keyword) => text.replaceAll(keyword, "Skyla"),
                    speech
                );

                fetchResponse(newSpeech);
            }
        };
        recognition.onspeechend = () => {
            speechToText();
        };
        recognition.onerror = (event) => {
            stopRecording();

            switch (event.error) {
                case "no-speech":
                    // Nothing was heard: simply start listening again.
                    speechToText();
                    break;
                case "audio-capture":
                    alert("No microphone was found. Ensure that a microphone is installed.");
                    break;
                case "not-allowed":
                    alert("Permission to use microphone is blocked.");
                    break;
                case "aborted":
                    alert("Listening Stopped.");
                    break;
                default:
                    alert("Error occurred in recognition: " + event.error);
                    break;
            }
        };
    } catch (error) {
        recording = false;

        console.log(error);
    }
}

/**
 * Shows the user's message in the conversation, records it in `messages` and
 * posts the whole conversation to /openai. When the server accepts the
 * request, an empty response element with id `questionNumber` is appended;
 * the `update` events of the SSE stream fill it in.
 *
 * @param {string} content - The user's query (typed or transcribed).
 */
function fetchResponse(content) {
    // Appends an <img> and a <p> holding `text` to `box`. The text is set via
    // textContent so user input is never parsed as HTML. When `wrapParagraph`
    // is true the <p> is placed inside a <div>.
    const fillBox = (box, iconSrc, text, wrapParagraph) => {
        const icon = document.createElement("img");
        icon.src = iconSrc;

        const paragraph = document.createElement("p");
        paragraph.textContent = text;

        if (wrapParagraph) {
            const wrapper = document.createElement("div");
            wrapper.append(paragraph);
            box.append(icon, wrapper);
        } else {
            box.append(icon, paragraph);
        }
    };

    // Append the speech to the main-content div.
    const newInputElement = document.createElement("div");
    newInputElement.className = "user-input content-box";
    fillBox(newInputElement, "icons/avatar.png", content, true);
    mainContent.append(newInputElement);
    mainContent.scrollTop = mainContent.scrollHeight;

    messages.push({ role: "user", content: content });

    // fetch to the api.
    fetch("/openai", {
        method: "POST",
        headers: {
            "Content-Type": "application/json",
        },
        body: JSON.stringify({ messages: messages, }),
    })
        .then((response) => {
            if (!response.ok) {
                throw new Error(`Request to /openai failed with status ${response.status}`);
            }
            return response.json();
        })
        .then((data) => {
            // Append the response placeholder to the main-content div.
            const newResponseElement = document.createElement("div");
            newResponseElement.className = "skyla-response content-box";
            newResponseElement.id = questionNumber;
            // The server's success body only carries `message`, so `data.data`
            // is normally absent; fall back to an empty paragraph rather than
            // rendering the text "undefined".
            fillBox(newResponseElement, "icons/skyla.png", data.data ?? "", false);
            mainContent.append(newResponseElement);
        })
        .catch((error) => console.error(error));
}

/**
 * Leaves recording mode: removes the "recording" class from the query box and
 * clears the `recording` flag.
 */
function stopRecording() {
    const { classList } = queryBox;

    classList.remove("recording");
    recording = false;
}

kzipqqlq

kzipqqlq1#

可能有两种方法:
1. 每当文本响应中出现句号(.)时,就调用ElevenLabs API。这样你至少可以得到一整句话的音频,并在音频和文本都准备好后,把它们一起发送回客户端。
2. 你也可以直接从客户端调用ElevenLabs。这样就不会干扰GPT响应的接收:响应会继续流入,而每当前端收到一个完整的句子时,就从前端调用ElevenLabs。
采用这种方法,你可能无法获得最佳的音频效果,但它应该可以工作。

相关问题