This is going to be fairly complicated, so hopefully I can explain it properly.
I'm using the OpenAI chat API to try to build a virtual assistant. What I do is submit a query, run it through the API, and stream the response back to the client in chunks as it is generated, to cut down the apparent response time.
What I'm trying to do now is generate text-to-speech with the ElevenLabs voice API while the OpenAI stream is still being generated. I don't know whether this is possible with ElevenLabs, or with any TTS service for that matter, but if anyone could help me out I would really appreciate it.
Server side
// Dependencies
const express = require("express");
const app = express();
const cors = require("cors");
const server = require("http").Server(app);
const { Configuration, OpenAIApi } = require("openai");
const { OpenAIExt } = require("openai-ext");
const voice = require('elevenlabs-node');
const fs = require('fs');
// Declare ejs, json formatting, set static files folder and initialise CORS.
app.set("view engine", "ejs");
app.set("json spaces", 2);
app.use(express.static("public"));
app.use(cors());
// Set the parser settings for JSON.
app.use(express.urlencoded({ extended: false }));
app.use(express.json());
// OpenAI Config
const configuration = new Configuration({
  apiKey: "",
});
const openai = new OpenAIApi(configuration);
// Set up Elevenlabs voice API
const apiKey = ''; // Your API key from Elevenlabs
const voiceID = 'pNInz6obpgDQGcFmaJgB'; // The ID of the voice you want to get
const fileName = 'public/speech.mp3'; // The name of your audio file
// Configure the stream (use type ServerStreamChatCompletionConfig for TypeScript users)
const streamConfig = {
  openai: openai,
  handler: {
    // Content contains the string draft, which may be partial. When isFinal is true, the completion is done.
    onContent(content, isFinal, stream) {
      console.log(content, "isFinal?", isFinal);
    },
    onDone(stream) {
      // console.log("Done!");
      stream.destroy();
    },
    onError(error, stream) {
      console.error(error);
    },
  },
};
// Set up SSE route for updates
app.get("/updates", (req, res) => {
res.setHeader("Content-Type", "text/event-stream");
res.setHeader("Cache-Control", "no-cache");
res.setHeader("Connection", "keep-alive");
// Send a comment to indicate that the connection was successful
res.write(": connected\n\n");
// Set up event listener for onContent updates
streamConfig.handler.onContent = (content, isFinal, stream) => {
try {
const data = JSON.stringify({ content, isFinal });
// Send the update to the client as an SSE event
res.write(`event: update\ndata: ${data}\n\n`);
if (isFinal == true) {
voice.textToSpeech(apiKey, voiceID, fileName, content).then(res => { // This is the closest I have been able to get,
console.log(res); // But this executes only once the content is done outputting
}); // And still doesnt really work
}
} catch (error) {
console.error("Error sending update:", error);
res.status(500).end();
}
};
streamConfig.handler.onDone = (stream) => {
// console.log("Done!");
stream.destroy();
res.end();
}
streamConfig.handler.onError = (error, stream) => {
console.error("Big bad error: " + error);
};
// Handle any errors that might occur while setting up the stream
streamConfig.handler.onError = (error) => {
console.error("Error setting up stream:", error);
res.status(500).end();
};
});
app.post("/openai", async (req, res) => {
  const messages = req.body.messages;
  // Make the call to stream the completion
  const response = await OpenAIExt.streamServerChatCompletion(
    {
      model: "gpt-3.5-turbo",
      messages: messages,
      max_tokens: 1024,
      temperature: 1,
      top_p: 1,
      frequency_penalty: 0.0,
      presence_penalty: 0.6,
    },
    streamConfig
  );
  // Send a success message back to the client
  res.json({ message: "Request successful" });
});
// Display the index page on the home route.
app.get("/", (req, res) => {
  res.render("index");
});
app.get("/test", (req, res) => {
  res.render("index copy");
});
server.listen(3000);
Client side
// script.js
// References
const queryBox = document.getElementById("query-box");
const mainContent = document.getElementById("main-content");
// Speech Recognition
const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
let recognition;
let recording = false;
let speaking = false;
const skylaKeywords = ["Skylar", "Skyler", "scholar"];
const messages = [
{ role: "system", content: "You are a virtual assistant named Skyla that is designed to speak like J.A.R.V.I.S from the Iron Man Movies, and respond as such by including a bit of his sass in responses. You can refer to me as Sir if you so wish." },
{ role: "user", content: "Skyla can you speak more like Jarvis" },
{ role: "assistant", content: "Of course, Sir. Is there a specific phrase or tone you would like me to emulate? Just let me know and I'll do my best to channel my inner J.A.R.V.I.S for you." },
{ role: "user", content: "I want you to speak like Jarvis from the first Iron Man movie, incorporating just a bit more of his Sass in responses" },
];
let questionNumber = 0;
// Start Recognition on page load.
addEventListener("load", (e) => {
  speechToText();
});
queryBox.addEventListener("keyup", function (e) {
  if (e.key === "Enter" && !e.shiftKey) {
    e.preventDefault();
    document.getElementById("submit-query").click();
  }
});
function submitQuery() {
  fetchResponse(queryBox.value);
  queryBox.style.height = "55px";
  queryBox.value = "";
}
const source = new EventSource("/updates");
source.addEventListener("open", () => {
console.log("Connection to updates endpoint opened");
});
source.addEventListener("update", (event) => {
const { content, isFinal } = JSON.parse(event.data);
const queryBox = document.getElementById(questionNumber);
speaking = true;
mainContent.scrollTop = mainContent.scrollHeight;
// Update the element with the new content
if (queryBox != null) {
queryBox.innerHTML = "<img src='icons/skyla.png'><div><p>" + content + "</p></div>";
}
if (isFinal) {
console.log("Completion finished");
const audio = new Audio(audioFile);
audio.play();
messages.push({ role: "assistant", content: content });
questionNumber += 1;
speaking = false;
}
});
// Convert speech to text
function speechToText() {
  try {
    // Initialise Speech Recognition
    recognition = new SpeechRecognition();
    recognition.lang = "en";
    recognition.interimResults = true;
    // Start Recognition
    recognition.start();
    recognition.onresult = (event) => {
      let speech = event.results[0][0].transcript;
      // Replace 'Skylar', 'Skyler' or 'Scholar' with Skyla
      skylaKeywords.forEach((keyword) => {
        if (speech === keyword && !recording) {
          speech = speech.replaceAll(keyword, "Skyla");
          queryBox.classList.add("recording");
          recording = true;
        }
      });
      console.log(speech);
      // Detect the final speech result.
      if (event.results[0].isFinal && recording && speaking == false) {
        let newSpeech = speech;
        skylaKeywords.forEach((keyword) => {
          if (speech.includes(keyword)) {
            newSpeech = speech.replaceAll(keyword, "Skyla");
          }
        });
        fetchResponse(newSpeech);
      }
    };
    recognition.onspeechend = () => {
      speechToText();
    };
    recognition.onerror = (event) => {
      stopRecording();
      switch (event.error) {
        case "no-speech":
          speechToText();
          break;
        case "audio-capture":
          alert("No microphone was found. Ensure that a microphone is installed.");
          break;
        case "not-allowed":
          alert("Permission to use microphone is blocked.");
          break;
        case "aborted":
          alert("Listening Stopped.");
          break;
        default:
          alert("Error occurred in recognition: " + event.error);
          break;
      }
    };
  } catch (error) {
    recording = false;
    console.log(error);
  }
}
function fetchResponse(content) {
  // Append the speech to the main-content div.
  const newInputElement = document.createElement("div");
  newInputElement.classList = "user-input content-box";
  newInputElement.innerHTML = "<img src='icons/avatar.png'><div><p>" + content + "</p></div>";
  mainContent.append(newInputElement);
  mainContent.scrollTop = mainContent.scrollHeight;
  messages.push({ role: "user", content: content });
  // fetch to the api.
  fetch("/openai", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ messages: messages }),
  })
    .then((response) => response.json())
    .then((data) => {
      // Append the response placeholder to the main-content div.
      const newResponseElement = document.createElement("div");
      newResponseElement.classList = "skyla-response content-box";
      newResponseElement.id = questionNumber;
      newResponseElement.innerHTML = "<img src='icons/skyla.png'><p>" + data.data + "</p>";
      mainContent.append(newResponseElement);
    })
    .catch((error) => console.error(error));
}
// Stop Voice Recognition
function stopRecording() {
  queryBox.classList.remove("recording");
  recording = false;
}
1 Answer
There are probably two ways to go about this (rough sketches of both follow below):
1. You could call the ElevenLabs API every time you get a full stop (.) in the text response. That way you at least get audio one sentence at a time, and you can send the audio along with the text data back to the client once both are ready.
2. You could also call ElevenLabs from the client itself. That way you don't interrupt the GPT response at all; you keep receiving it, and you call ElevenLabs from the front end whenever you have received a complete sentence.
You may not get the most natural-sounding audio this way, but it should work.
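For the first approach, here is a minimal server-side sketch. It reuses the same `voice.textToSpeech(apiKey, voiceID, fileName, text)` helper from `elevenlabs-node` that the question already uses; the sentence-splitting regex, the per-sentence file names, and the extra `audio` SSE event are illustrative assumptions, not a tested implementation.

// Inside the /updates route, so `res` is the SSE response from the question's code.
let spokenLength = 0;   // characters of the draft already handed to ElevenLabs
let sentenceIndex = 0;  // gives each audio chunk its own file name

streamConfig.handler.onContent = (content, isFinal, stream) => {
  // `content` is the whole draft so far, so only inspect the unspoken tail
  const pending = content.slice(spokenLength);
  const match = pending.match(/[^.!?]*[.!?]/); // first complete sentence, if any
  if (match) {
    const sentence = match[0].trim();
    spokenLength += match[0].length;
    const file = `public/speech-${sentenceIndex++}.mp3`; // hypothetical per-sentence file
    voice.textToSpeech(apiKey, voiceID, file, sentence)
      .then(() => {
        // Tell the client a new audio chunk is ready (served from the public folder)
        const audioData = JSON.stringify({ file: file.replace("public/", "") });
        res.write(`event: audio\ndata: ${audioData}\n\n`);
      })
      .catch((err) => console.error("TTS error:", err));
  }
  // Forward the text update exactly as before
  res.write(`event: update\ndata: ${JSON.stringify({ content, isFinal })}\n\n`);
};

The client would then listen for the `audio` events and queue the files for playback in order. If a single chunk can finish more than one sentence, you would want to loop over `pending` instead of matching once.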
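For the second approach, a client-side sketch could look roughly like this. It assumes ElevenLabs' public text-to-speech REST endpoint (`POST https://api.elevenlabs.io/v1/text-to-speech/{voice_id}` with an `xi-api-key` header); the sentence splitting and the simple playback queue are assumptions for illustration, and putting the API key in the browser is only reasonable for local experiments.

// Queue per-sentence audio so the clips play back in order.
let ttsSpokenLength = 0;
const audioQueue = [];
let playingAudio = false;

function playNextAudio() {
  if (playingAudio || audioQueue.length === 0) return;
  playingAudio = true;
  const audio = new Audio(audioQueue.shift());
  audio.onended = () => { playingAudio = false; playNextAudio(); };
  audio.play();
}

async function speakSentence(sentence) {
  // Direct call to ElevenLabs from the browser (voice ID taken from the question)
  const response = await fetch(
    "https://api.elevenlabs.io/v1/text-to-speech/pNInz6obpgDQGcFmaJgB",
    {
      method: "POST",
      headers: {
        "xi-api-key": "", // your ElevenLabs API key (exposed client-side!)
        "Content-Type": "application/json",
      },
      body: JSON.stringify({ text: sentence }),
    }
  );
  const blob = await response.blob();         // audio/mpeg response body
  audioQueue.push(URL.createObjectURL(blob)); // queue the clip for ordered playback
  playNextAudio();
}

// Inside the existing "update" SSE listener, speak each newly completed sentence:
// const unspoken = content.slice(ttsSpokenLength);
// const match = unspoken.match(/[^.!?]*[.!?]/);
// if (match) {
//   ttsSpokenLength += match[0].length;
//   speakSentence(match[0].trim());
// }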