I am building a small app that streams audio data to the Google Speech API. I managed to piece together the functions below from various Stack Overflow posts to convert a local audio file to LINEAR16 format. I then send the data over UDP to my server, where it is streamed to the Google Speech API. The problem is that the audio is never recognized. I noticed that if I change Uint16Array to Uint8Array on the Node server I start getting some results back, but that turns the 16-bit data into 8-bit data, and I don't think that conversion should be necessary.
The code that sends to Google Speech; the raw data is written into the stream:
import { Writable } from 'stream'
import speech from '@google-cloud/speech'

// SpeechToText.GsConfig, SpeechRecognitionResult, and keyAndEmail are defined elsewhere in the app.
export const streamingRecognize = ({
  config,
  cb,
}: {
  config: SpeechToText.GsConfig
  cb?: (data: SpeechRecognitionResult) => void
}): Writable => {
  const request = {
    config,
    singleUtterance: false,
    interimResults: true, // Get interim results from the stream
    enableWordTimeOffsets: true,
  }
  const client = new speech.SpeechClient({ credentials: keyAndEmail })
  return client
    .streamingRecognize(request)
    .on('error', (err) => console.log(err))
    .on('data', (data) => {
      cb && cb(data)
    })
}
export const openStream = (audioConfig, callback) =>
  streamingRecognize({
    config: {
      languageCode: audioConfig.languageCode,
      sampleRateHertz: parseInt(audioConfig.sampleRateHertz),
      encoding: audioConfig.encoding as any,
    },
    cb: (data: any) => {
      callback(data)
    },
  })
const stream = openStream({
  sampleRateHertz: 16000,
  languageCode: 'en-US',
  encoding: 'LINEAR16' as any,
}, (data) => {
  console.log(data)
})

// here data is the LINEAR16 audio data received from the iOS client
const audioData = new Uint16Array(data)
const audioDataBuffer = Buffer.from(audioData)
stream.write(audioDataBuffer)
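A note on Node buffer semantics that may be relevant to the Uint16Array/Uint8Array observation above: Buffer.from(typedArray) copies the array's element values truncated to single bytes, whereas Buffer.from(typedArray.buffer, byteOffset, byteLength) reuses the raw underlying bytes. A minimal standalone sketch, assuming a little-endian host:

import { Buffer } from 'buffer'

// One 16-bit sample, 0x1234, stored little-endian as bytes 0x34 0x12.
const samples = new Uint16Array([0x1234])

// Copies each ELEMENT truncated to one byte -- half of every sample is lost.
const truncated = Buffer.from(samples)
console.log(truncated) // <Buffer 34>

// Shares the underlying bytes, preserving the full 16-bit samples.
const raw = Buffer.from(samples.buffer, samples.byteOffset, samples.byteLength)
console.log(raw) // <Buffer 34 12>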
The code that reads the file and sends the raw audio data to the server:
import AVFoundation

extension Recorder {
  func openFile(
    _ args: NSDictionary,
    resolver resolve: @escaping RCTPromiseResolveBlock,
    rejecter reject: @escaping RCTPromiseRejectBlock
  ) {
    self.delegate?.stopSignaling()
    let withSignaling: Bool = (args["withSignaling"] != nil)
    let fullPath = args["url"]! as! String
    let userId = args["userId"] as! String
    let parts = fullPath.components(separatedBy: "/")
    let filename = parts[parts.count - 1]
    // Forwards each converted chunk to JS as a React Native event (and optionally over the signaling channel).
    let sendData: (_ data: UnsafeMutableRawPointer, _ byteSize: Int) -> Void = {
      [weak self] data, byteSize in
      guard let weakself = self else {
        return
      }
      let imageData: Data = Data(bytes: data, count: byteSize)
      weakself.delegate?.sendEvent(
        withName: "fileTranscription",
        body: [
          "key": userId + "|" + filename,
          "data": imageData.withUnsafeBytes {
            Array($0.bindMemory(to: Int16.self)).map(Int16.init(bigEndian:))
          }
        ]
      )
      if (withSignaling) {
        weakself.delegate?.onAudioData(imageData)
      }
    }
    // Reads the file and converts it to 16 kHz mono Int16 PCM, emitting each chunk through sendData.
    let attemptToConnect: () -> Void = {
      [weak self] in
      guard let weakself = self else {
        return
      }
      do {
        let url = URL(fileURLWithPath: fullPath)
        let data = try Data(contentsOf: url)
        let format = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false)!
        _ = data.convertedTo(format, sendData)
        resolve(userId + "|" + filename)
        if (withSignaling) {
          weakself.delegate?.stopSignaling()
        }
      } catch {
        print("an error occurred", error)
      }
    }
    if (withSignaling) {
      do {
        try self.delegate?.startSignaling([
          "filename": filename,
          "userId": args["userId"]!,
          "languageCode": args["languageCode"]!,
          "sampleRateHertz": args["sampleRateHertz"]!,
          "encoding": args["encoding"]!
        ], attemptToConnect)
      } catch {}
    } else {
      attemptToConnect()
    }
  }
}
func data_AudioFile_ReadProc(_ inClientData: UnsafeMutableRawPointer, _ inPosition: Int64, _ requestCount: UInt32, _ buffer: UnsafeMutableRawPointer, _ actualCount: UnsafeMutablePointer<UInt32>) -> OSStatus {
  let data = inClientData.assumingMemoryBound(to: Data.self).pointee
  let bufferPointer = UnsafeMutableRawBufferPointer(start: buffer, count: Int(requestCount))
  let copied = data.copyBytes(to: bufferPointer, from: Int(inPosition) ..< Int(inPosition) + Int(requestCount))
  actualCount.pointee = UInt32(copied)
  return noErr
}

func data_AudioFile_GetSizeProc(_ inClientData: UnsafeMutableRawPointer) -> Int64 {
  let data = inClientData.assumingMemoryBound(to: Data.self).pointee
  return Int64(data.count)
}
extension Data {
  func convertedTo(_ format: AVAudioFormat, _ cb: ((_ data: UnsafeMutableRawPointer, _ byteSize: Int) -> Void)? = nil) -> AVAudioPCMBuffer? {
    var data = self
    var af: AudioFileID? = nil
    // Open the in-memory Data as an audio file via the read/size callbacks above.
    var status = AudioFileOpenWithCallbacks(&data, data_AudioFile_ReadProc, nil, data_AudioFile_GetSizeProc(_:), nil, 0, &af)
    guard status == noErr, af != nil else {
      return nil
    }
    defer {
      AudioFileClose(af!)
    }
    var eaf: ExtAudioFileRef? = nil
    status = ExtAudioFileWrapAudioFileID(af!, false, &eaf)
    guard status == noErr, eaf != nil else {
      return nil
    }
    defer {
      ExtAudioFileDispose(eaf!)
    }
    // Ask ExtAudioFile to deliver frames converted to the requested client format.
    var clientFormat = format.streamDescription.pointee
    status = ExtAudioFileSetProperty(eaf!, kExtAudioFileProperty_ClientDataFormat, UInt32(MemoryLayout.size(ofValue: clientFormat)), &clientFormat)
    guard status == noErr else {
      return nil
    }
    if let channelLayout = format.channelLayout {
      var clientChannelLayout = channelLayout.layout.pointee
      status = ExtAudioFileSetProperty(eaf!, kExtAudioFileProperty_ClientChannelLayout, UInt32(MemoryLayout.size(ofValue: clientChannelLayout)), &clientChannelLayout)
      guard status == noErr else {
        return nil
      }
    }
    var frameLength: Int64 = 0
    var propertySize: UInt32 = UInt32(MemoryLayout.size(ofValue: frameLength))
    status = ExtAudioFileGetProperty(eaf!, kExtAudioFileProperty_FileLengthFrames, &propertySize, &frameLength)
    guard status == noErr else {
      return nil
    }
    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(frameLength)) else {
      return nil
    }
    // 16000 samples per second
    // 0.1 s * 16000 = 1600 frames
    // 2 bytes per frame = 3200-byte buffer
    let bufferSizeFrames = 1600
    let bufferSizeBytes = Int(format.streamDescription.pointee.mBytesPerFrame) * bufferSizeFrames
    let numBuffers = format.isInterleaved ? 1 : Int(format.channelCount)
    let numInterleavedChannels = format.isInterleaved ? Int(format.channelCount) : 1
    let audioBufferList = AudioBufferList.allocate(maximumBuffers: numBuffers)
    for i in 0 ..< numBuffers {
      audioBufferList[i] = AudioBuffer(mNumberChannels: UInt32(numInterleavedChannels), mDataByteSize: UInt32(bufferSizeBytes), mData: malloc(bufferSizeBytes))
    }
    defer {
      for buffer in audioBufferList {
        free(buffer.mData)
      }
      free(audioBufferList.unsafeMutablePointer)
    }
    // Read in ~0.1 s chunks; hand each raw chunk to cb, then append it to the PCM buffer.
    while true {
      var frameCount: UInt32 = UInt32(bufferSizeFrames)
      status = ExtAudioFileRead(eaf!, &frameCount, audioBufferList.unsafeMutablePointer)
      guard status == noErr else {
        return nil
      }
      if frameCount == 0 {
        break
      }
      let src = audioBufferList
      let dst = UnsafeMutableAudioBufferListPointer(pcmBuffer.mutableAudioBufferList)
      if src.count != dst.count {
        return nil
      }
      for i in 0 ..< src.count {
        let srcBuf = src[i]
        let dstBuf = dst[i]
        if cb != nil {
          cb!(srcBuf.mData!, Int(srcBuf.mDataByteSize))
        }
        memcpy(dstBuf.mData?.advanced(by: Int(dstBuf.mDataByteSize)), srcBuf.mData, Int(srcBuf.mDataByteSize))
      }
      pcmBuffer.frameLength += frameCount
    }
    return pcmBuffer
  }
}
1 Answer
1/ Use this sample to build a transcription function:
https://github.com/googlesamples/assistant-sdk-python/blob/master/google-assistant-sdk/googlesamples/assistant/grpc/audiofileinput.py
2/ Use pyaudio to create an input stream that feeds the function from step 1, as follows:
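A minimal sketch of such an input stream, assuming LINEAR16 at 16 kHz mono as in the question; process_chunk is a hypothetical stand-in for the function built in step 1:

import pyaudio

RATE = 16000              # matches sampleRateHertz in the question
CHUNK = 1600              # 0.1 s of audio per read, like the Swift buffer size
FORMAT = pyaudio.paInt16  # LINEAR16 is signed 16-bit little-endian PCM

def process_chunk(chunk: bytes) -> None:
    # Hypothetical stand-in for the function built from the sample in step 1.
    print(f"got {len(chunk)} bytes of audio")

audio = pyaudio.PyAudio()
stream = audio.open(format=FORMAT, channels=1, rate=RATE,
                    input=True, frames_per_buffer=CHUNK)
try:
    while True:
        # exception_on_overflow=False avoids raising if the consumer falls behind
        process_chunk(stream.read(CHUNK, exception_on_overflow=False))
except KeyboardInterrupt:
    pass
finally:
    stream.stop_stream()
    stream.close()
    audio.terminate()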