swift 将音频文件转换为Linear16格式并将原始数据流式传输到谷歌语音API

6kkfgxo0  于 2023-03-17  发布在  Swift
关注(0)|答案(1)|浏览(125)

我正在开发一个小应用程序,用于将音频数据流式传输到 Google Speech API。我参考了多个 Stack Overflow 帖子,拼凑出下面的函数,把本地音频文件转换为 LINEAR16 格式;然后通过 UDP 将数据发送到我的服务器,再由服务器把数据流式传输到 Google Speech API。问题是音频无法被识别。我注意到,如果在 Node 服务器上把 Uint16Array 改为 Uint8Array,服务器就会开始返回一些结果,但这相当于把 16 位数据转换成 8 位数据,我认为这种转换是没有必要的。
下面是发送到 Google Speech 的代码,原始数据会被写入该流:

/**
 * Opens a streaming recognition request and returns the writable stream that
 * raw audio must be written to.
 *
 * @param config Recognition settings (language, sample rate, encoding).
 * @param cb     Optional listener invoked with every response the stream emits.
 * @returns The stream created by `SpeechClient.streamingRecognize`.
 */
export const streamingRecognize = ({
  config,
  cb,
}: {
  config: SpeechToText.GsConfig
  cb?: (data: SpeechRecognitionResult) => void
}): Writable => {
  const request = {
    // `enableWordTimeOffsets` is a field of the recognition config, not of the
    // streaming config, so it has to live here to take effect. It is listed
    // first so that an explicit value supplied by the caller still wins.
    config: { enableWordTimeOffsets: true, ...config },
    singleUtterance: false,
    interimResults: true, // Get interim results from stream
  }
  const client = new speech.SpeechClient({ credentials: keyAndEmail })
  return client
    .streamingRecognize(request)
    .on('error', (err) => console.log(err))
    .on('data', (data) => {
      if (cb) {
        cb(data)
      }
    })
}
/**
 * Opens a recognition stream from a loosely typed audio configuration.
 *
 * @param audioConfig Object carrying `languageCode`, `sampleRateHertz` and
 *                    `encoding`.
 * @param callback    Invoked with every response received from the stream.
 * @returns The writable stream returned by `streamingRecognize`.
 */
export const openStream = (audioConfig, callback) =>
  streamingRecognize({
    config: {
      languageCode: audioConfig.languageCode,
      // Explicit radix so the value is always parsed as a decimal number.
      sampleRateHertz: parseInt(audioConfig.sampleRateHertz, 10),
      encoding: audioConfig.encoding as any
    },
    cb: (data: any) => {
      callback(data)
    },
  })
// Open a recognition stream configured for 16000 Hz, en-US, LINEAR16 audio and
// log every response passed to the callback.
const stream = openStream({
  sampleRateHertz: 16000,
  languageCode: 'en-US',
  encoding: 'LINEAR16' as any,
}, (data) => {
  console.log(data)
})
// here data is the linear16 audio data received from the ios client
// NOTE(review): `data` is not defined in this snippet, so its type cannot be
// confirmed from here. Buffer.from(typedArray) copies element VALUES and keeps
// only the low 8 bits of each one, so if `data` holds 16-bit sample values the
// high byte of every sample is lost before it reaches the stream. If `data` is
// already a byte buffer, the Uint16Array round trip appears to be unnecessary.
// TODO confirm; Buffer.from(audioData.buffer, audioData.byteOffset,
// audioData.byteLength) would keep both bytes of each sample.
const audioData = new Uint16Array(data)
const audioDataBuffer = Buffer.from(audioData)
stream.write(audioDataBuffer)

下面是用于读取该音频文件并将原始音频数据发送到服务器的代码。

import AVFoundation

extension Recorder {
  /// Reads a local audio file, converts it to 16 kHz mono 16-bit PCM and
  /// forwards every converted chunk to the delegate.
  ///
  /// The promise is settled on every path: it resolves with `"<userId>|<filename>"`
  /// once the whole file has been converted, and rejects when the arguments are
  /// invalid, signaling cannot be started, the file cannot be read or the
  /// conversion fails.
  ///
  /// - Parameters:
  ///   - args: Dictionary read for `url` and `userId` (both strings, required),
  ///     `withSignaling` (optional), and `languageCode`, `sampleRateHertz` and
  ///     `encoding` (required only when signaling is enabled).
  ///   - resolve: Promise resolver.
  ///   - reject: Promise rejecter.
  func openFile(
    _ args: NSDictionary,
    resolver resolve: @escaping RCTPromiseResolveBlock,
    rejecter reject: @escaping RCTPromiseRejectBlock
  ) {
    self.delegate?.stopSignaling()

    // Missing or mistyped arguments reject the promise instead of crashing.
    guard let fullPath = args["url"] as? String, let userId = args["userId"] as? String else {
      reject("E_INVALID_ARGS", "openFile requires string values for \"url\" and \"userId\"", nil)
      return
    }

    // An explicit boolean is honoured, so `withSignaling: false` disables
    // signaling; any other value keeps the previous "key is present" meaning.
    let signalingFlag = args["withSignaling"]
    let withSignaling: Bool = (signalingFlag as? Bool) ?? (signalingFlag != nil)

    let filename = fullPath.components(separatedBy: "/").last ?? fullPath
    let key = userId + "|" + filename

    // Receives each converted chunk; the pointer is only valid during the call,
    // so the bytes are copied into a Data value right away.
    let sendData: (_ data: UnsafeMutableRawPointer, _ byteSize: Int) -> Void = {
      [weak self] data, byteSize in
      guard let strongSelf = self else {
        return
      }
      let chunk: Data = Data(bytes: data, count: byteSize)
      strongSelf.delegate?.sendEvent(
        withName: "fileTranscription",
        body: [
          "key": key,
          // NOTE(review): Int16(bigEndian:) byte-swaps every sample on a
          // little-endian host. Confirm the event consumer expects that.
          "data": chunk.withUnsafeBytes {
            Array($0.bindMemory(to: Int16.self)).map(Int16.init(bigEndian:))
          }
        ]
      )
      if withSignaling {
        strongSelf.delegate?.onAudioData(chunk)
      }
    }

    // Loads the file, streams it through `sendData` and settles the promise.
    let attemptToConnect: () -> Void = {
      [weak self] in
      guard let strongSelf = self else {
        reject("E_RECORDER_RELEASED", "Recorder was released before the file could be read", nil)
        return
      }
      do {
        let url = URL(fileURLWithPath: fullPath)
        let data = try Data(contentsOf: url)
        guard let format = AVAudioFormat(commonFormat: .pcmFormatInt16, sampleRate: 16000, channels: 1, interleaved: false) else {
          reject("E_AUDIO_FORMAT", "Could not create the 16 kHz mono Int16 audio format", nil)
          return
        }
        // A nil result means the conversion failed; do not report success.
        guard data.convertedTo(format, sendData) != nil else {
          reject("E_CONVERSION_FAILED", "Could not convert \(filename) to 16-bit PCM", nil)
          return
        }
        resolve(key)
        if withSignaling {
          strongSelf.delegate?.stopSignaling()
        }
      } catch {
        print("an error occured", error)
        reject("E_FILE_READ", error.localizedDescription, error)
      }
    }

    guard withSignaling else {
      attemptToConnect()
      return
    }

    guard
      let languageCode = args["languageCode"],
      let sampleRateHertz = args["sampleRateHertz"],
      let encoding = args["encoding"]
    else {
      reject("E_INVALID_ARGS", "Signaling requires \"languageCode\", \"sampleRateHertz\" and \"encoding\"", nil)
      return
    }

    do {
      try self.delegate?.startSignaling([
        "filename": filename,
        "userId": userId,
        "languageCode": languageCode,
        "sampleRateHertz": sampleRateHertz,
        "encoding": encoding
      ], attemptToConnect)
    } catch {
      // Previously swallowed, which left the promise pending forever.
      reject("E_SIGNALING", error.localizedDescription, error)
    }
  }
}

/// Audio File read callback backed by an in-memory `Data` value.
///
/// - Parameters:
///   - inClientData: Pointer to the `Data` that holds the file bytes.
///   - inPosition: Byte offset, relative to the start of the data, to read from.
///   - requestCount: Number of bytes requested.
///   - buffer: Destination with room for at least `requestCount` bytes.
///   - actualCount: Receives the number of bytes actually copied.
/// - Returns: Always `noErr`; a read at or beyond the end reports zero bytes.
func data_AudioFile_ReadProc(_ inClientData: UnsafeMutableRawPointer, _ inPosition: Int64, _ requestCount: UInt32, _ buffer: UnsafeMutableRawPointer, _ actualCount: UnsafeMutablePointer<UInt32>) -> OSStatus {
    let data = inClientData.assumingMemoryBound(to: Data.self).pointee
    actualCount.pointee = 0

    // Nothing to copy for an empty request or an offset outside the data;
    // slicing `Data` out of bounds would trap instead of reporting a short read.
    guard requestCount > 0, inPosition >= 0, inPosition < Int64(data.count) else {
        return noErr
    }

    // Clamp the request to the bytes that remain, and offset by `startIndex`
    // because a `Data` slice does not necessarily start at index 0.
    let offset = Int(inPosition)
    let length = Swift.min(Int(requestCount), data.count - offset)
    let start = data.startIndex + offset
    let bufferPointer = UnsafeMutableRawBufferPointer(start: buffer, count: length)
    let copied = data.copyBytes(to: bufferPointer, from: start ..< start + length)
    actualCount.pointee = UInt32(copied)
    return noErr
}

/// Audio File size callback: reports the byte count of the `Data` value that
/// `inClientData` points to.
func data_AudioFile_GetSizeProc(_ inClientData: UnsafeMutableRawPointer) -> Int64 {
    let backingStore = inClientData.assumingMemoryBound(to: Data.self)
    let byteCount = backingStore.pointee.count
    return Int64(byteCount)
}

extension Data {
  /// Decodes the audio file stored in this `Data` into PCM samples of `format`.
  ///
  /// The bytes are opened with `AudioFileOpenWithCallbacks`, wrapped in an
  /// `ExtAudioFile` whose client format is `format`, and read in chunks of
  /// 1600 frames. Each chunk is passed to `cb` and appended to the result.
  ///
  /// - Parameters:
  ///   - format: PCM format the decoded audio is delivered in.
  ///   - cb: Optional callback invoked once per chunk and per buffer with a
  ///     pointer to the decoded bytes and their length. The memory is reused
  ///     for the next chunk, so it must be copied if it is needed later.
  /// - Returns: A buffer holding the whole decoded file, or `nil` when an Audio
  ///   Toolbox call fails, memory cannot be allocated, or the decoded audio
  ///   does not fit the estimated capacity.
  func convertedTo(_ format: AVAudioFormat, _ cb: ((_ data: UnsafeMutableRawPointer, _ byteSize: Int) -> Void)? = nil) -> AVAudioPCMBuffer? {
    // The read/size callbacks dereference the client-data pointer long after
    // AudioFileOpenWithCallbacks returns. A pointer produced by `&value` is
    // only valid for the duration of that one call, so the data is given
    // stable storage that outlives the audio file (defers run in reverse
    // order, so the file is closed before this storage is released).
    let clientData = UnsafeMutablePointer<Data>.allocate(capacity: 1)
    clientData.initialize(to: self)
    defer {
      clientData.deinitialize(count: 1)
      clientData.deallocate()
    }

    var af: AudioFileID? = nil
    var status = AudioFileOpenWithCallbacks(clientData, data_AudioFile_ReadProc, nil, data_AudioFile_GetSizeProc(_:), nil, 0, &af)
    guard status == noErr, let audioFile = af else {
      return nil
    }

    defer {
      AudioFileClose(audioFile)
    }

    var eaf: ExtAudioFileRef? = nil
    status = ExtAudioFileWrapAudioFileID(audioFile, false, &eaf)
    guard status == noErr, let extFile = eaf else {
      return nil
    }

    defer {
      ExtAudioFileDispose(extFile)
    }

    // Ask ExtAudioFile to convert to the requested PCM format while reading.
    var clientFormat = format.streamDescription.pointee
    status = ExtAudioFileSetProperty(extFile, kExtAudioFileProperty_ClientDataFormat, UInt32(MemoryLayout.size(ofValue: clientFormat)), &clientFormat)
    guard status == noErr else {
      return nil
    }

    if let channelLayout = format.channelLayout {
      var clientChannelLayout = channelLayout.layout.pointee
      status = ExtAudioFileSetProperty(extFile, kExtAudioFileProperty_ClientChannelLayout, UInt32(MemoryLayout.size(ofValue: clientChannelLayout)), &clientChannelLayout)
      guard status == noErr else {
        return nil
      }
    }

    var fileFrameCount: Int64 = 0
    var propertySize: UInt32 = UInt32(MemoryLayout.size(ofValue: fileFrameCount))
    status = ExtAudioFileGetProperty(extFile, kExtAudioFileProperty_FileLengthFrames, &propertySize, &fileFrameCount)
    guard status == noErr, fileFrameCount >= 0 else {
      return nil
    }

    var fileFormat = AudioStreamBasicDescription()
    propertySize = UInt32(MemoryLayout.size(ofValue: fileFormat))
    status = ExtAudioFileGetProperty(extFile, kExtAudioFileProperty_FileDataFormat, &propertySize, &fileFormat)
    guard status == noErr, fileFormat.mSampleRate > 0 else {
      return nil
    }

    // 16000 samples per second
    // 0.1s * 16000 = 1600
    // 2 bytes per frame = 3200 buffer size
    let chunkFrames = 1600
    let bytesPerFrame = Int(format.streamDescription.pointee.mBytesPerFrame)
    let chunkBytes = bytesPerFrame * chunkFrames

    // The file length is counted at the file's sample rate, but the buffer is
    // filled at the client rate. Scale it, and add one chunk of slack, so that
    // upsampled audio cannot run past the end of the destination.
    let estimatedFrames = (Double(fileFrameCount) * format.sampleRate / fileFormat.mSampleRate).rounded(.up) + Double(chunkFrames)
    guard estimatedFrames <= Double(AVAudioFrameCount.max) else {
      return nil
    }
    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: AVAudioFrameCount(estimatedFrames)) else {
      return nil
    }

    let numBuffers = format.isInterleaved ? 1 : Int(format.channelCount)
    let numInterleavedChannels = format.isInterleaved ? Int(format.channelCount) : 1
    let audioBufferList = AudioBufferList.allocate(maximumBuffers: numBuffers)
    for i in 0 ..< numBuffers {
      audioBufferList[i] = AudioBuffer(mNumberChannels: UInt32(numInterleavedChannels), mDataByteSize: UInt32(chunkBytes), mData: malloc(chunkBytes))
    }

    defer {
      for buffer in audioBufferList {
        free(buffer.mData)
      }
      free(audioBufferList.unsafeMutablePointer)
    }

    // Never hand a buffer whose allocation failed to ExtAudioFileRead.
    guard audioBufferList.allSatisfy({ $0.mData != nil }) else {
      return nil
    }

    let dst = UnsafeMutableAudioBufferListPointer(pcmBuffer.mutableAudioBufferList)

    while true {
      // ExtAudioFileRead shrinks mDataByteSize to what it delivered; restore
      // the full capacity so a short read does not limit the next one.
      for i in 0 ..< numBuffers {
        audioBufferList[i].mDataByteSize = UInt32(chunkBytes)
      }

      var frameCount: UInt32 = UInt32(chunkFrames)
      status = ExtAudioFileRead(extFile, &frameCount, audioBufferList.unsafeMutablePointer)
      guard status == noErr else {
        return nil
      }

      if frameCount == 0 {
        break
      }

      if audioBufferList.count != dst.count {
        return nil
      }

      // Refuse to write beyond the destination's capacity.
      guard UInt64(pcmBuffer.frameLength) + UInt64(frameCount) <= UInt64(pcmBuffer.frameCapacity) else {
        return nil
      }

      // The write position is derived from the frames already stored rather
      // than from the destination's mDataByteSize, so it does not depend on
      // whether that field tracks frameLength or frameCapacity.
      let writeOffset = Int(pcmBuffer.frameLength) * bytesPerFrame
      let validBytes = Int(frameCount) * bytesPerFrame

      for i in 0 ..< audioBufferList.count {
        guard let srcData = audioBufferList[i].mData, let dstData = dst[i].mData else {
          return nil
        }
        cb?(srcData, validBytes)
        memcpy(dstData.advanced(by: writeOffset), srcData, validBytes)
      }

      pcmBuffer.frameLength += frameCount
    }

    return pcmBuffer
  }
}
qybjjes1

qybjjes11#

1/ 使用下面这个示例:
https://github.com/googlesamples/assistant-sdk-python/blob/master/google-assistant-sdk/googlesamples/assistant/grpc/audiofileinput.py
并创建如下函数:

def speech2text(audio_stream):

....

2/ 使用 pyaudio 为第 1 步中的函数创建输入流,如下所示:

def main():
    """Open a microphone stream, run one recognition pass and print the result.

    A 16-bit mono input stream is opened at DEFAULT_AUDIO_SAMPLE_RATE and
    passed to speech2text(); a non-empty transcript is printed in yellow.
    The stream and the PyAudio instance are released even when recognition
    raises.
    """
    start_time = get_current_time()
    final_result = ''
    while get_current_time() - start_time < STREAMING_LIMIT:
        pa = pyaudio.PyAudio()
        audio_stream = None
        try:
            audio_stream = pa.open(
                rate=DEFAULT_AUDIO_SAMPLE_RATE,
                channels=1,
                format=pyaudio.paInt16,
                input=True,
                output_device_index=None,
                frames_per_buffer=512)
            final_result = speech2text(audio_stream)
            if len(final_result) > 0:
                print(colored('[HUMAN]: ' + final_result, 'yellow'))
        finally:
            # Close the stream before terminating PyAudio: terminating first
            # tears down the session the stream still belongs to.
            if audio_stream is not None:
                audio_stream.close()
            pa.terminate()
        # Only a single recognition pass is performed per call.
        break

相关问题