// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "chrome/browser/speech/speech_recognizer.h"
#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
#include "base/time.h"
#include "chrome/browser/browser_thread.h"
#include "chrome/browser/profile.h"
#include "chrome/common/net/url_request_context_getter.h"
#include "third_party/speex/include/speex/speex.h"
using media::AudioInputController;
using std::list;
using std::string;
namespace {
const char* const kContentTypeSpeex =
"audio/x-speex-with-header-byte; rate=16000";
const int kSpeexEncodingQuality = 8;
// Maximum length in bytes of a single encoded frame: one 20 ms frame at
// Speex's peak rate of 44 kbps (the 32 kHz mode) is 110 bytes.
const int kMaxSpeexFrameLength = 110;
// Since the frame length gets written out as a byte in the encoded packet,
// make sure it is within the byte range.
COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
// The following constants are related to the volume level indicator shown in
// the UI for recorded audio.
// Multiplier used when new volume is greater than previous level.
const float kUpSmoothingFactor = 0.9f;
// Multiplier used when new volume is less than previous level.
const float kDownSmoothingFactor = 0.4f;
const float kAudioMeterMinDb = 10.0f; // Lower bar for volume meter.
const float kAudioMeterDbRange = 25.0f;  // Meter is full scale at min + range.
} // namespace
namespace speech_input {
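// Audio capture parameters. Capture is 16-bit mono PCM at 16 kHz, which
// matches both the Speex wideband mode used by SpeexEncoder below and the
// "rate=16000" content type sent with the recognition request.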
const int SpeechRecognizer::kAudioSampleRate = 16000;
const int SpeechRecognizer::kAudioPacketIntervalMs = 100;
const int SpeechRecognizer::kNumAudioChannels = 1;
const int SpeechRecognizer::kNumBitsPerAudioSample = 16;
const int SpeechRecognizer::kNoSpeechTimeoutSec = 8;
const int SpeechRecognizer::kEndpointerEstimationTimeMs = 300;
// Provides a simple interface to encode raw audio using the Speex codec.
class SpeexEncoder {
public:
SpeexEncoder();
~SpeexEncoder();
int samples_per_frame() const { return samples_per_frame_; }
// Encodes each frame of raw audio in |samples| and adds the
// encoded frames as a set of strings to the |encoded_frames| list.
// Ownership of the newly added strings is transferred to the caller.
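  //
  // Illustrative usage (hypothetical caller):
  //   std::list<std::string*> frames;
  //   encoder.Encode(samples, num_samples, &frames);
  //   ...  // Consume the encoded frames.
  //   // The caller owns the strings and must delete each one when done.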
void Encode(const short* samples,
int num_samples,
std::list<std::string*>* encoded_frames);
private:
SpeexBits bits_;
void* encoder_state_;
int samples_per_frame_;
char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size.
};
SpeexEncoder::SpeexEncoder() {
// speex_bits_init() does not initialize all of the |bits_| struct.
memset(&bits_, 0, sizeof(bits_));
speex_bits_init(&bits_);
encoder_state_ = speex_encoder_init(&speex_wb_mode);
DCHECK(encoder_state_);
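  // Query the frame size (in samples) for the chosen mode; for wideband
  // Speex this is 320 samples, i.e. 20 ms of audio at 16 kHz.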
speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
DCHECK(samples_per_frame_ > 0);
int quality = kSpeexEncodingQuality;
speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
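  // Enable variable bit rate so that quieter or simpler frames encode to
  // fewer bytes.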
int vbr = 1;
speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
}
SpeexEncoder::~SpeexEncoder() {
speex_bits_destroy(&bits_);
speex_encoder_destroy(encoder_state_);
}
void SpeexEncoder::Encode(const short* samples,
int num_samples,
std::list<std::string*>* encoded_frames) {
  // Drop incomplete frames, typically those that arrive when recording stops.
num_samples -= (num_samples % samples_per_frame_);
for (int i = 0; i < num_samples; i += samples_per_frame_) {
speex_bits_reset(&bits_);
speex_encode_int(encoder_state_, const_cast<spx_int16_t*>(samples + i),
&bits_);
// Encode the frame and place the size of the frame as the first byte. This
// is the packet format for MIME type x-speex-with-header-byte.
int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
kMaxSpeexFrameLength);
encoded_frame_data_[0] = static_cast<char>(frame_length);
encoded_frames->push_back(new string(encoded_frame_data_,
frame_length + 1));
}
}
SpeechRecognizer::SpeechRecognizer(Delegate* delegate,
int caller_id,
const std::string& language,
const std::string& grammar,
const std::string& hardware_info)
: delegate_(delegate),
caller_id_(caller_id),
language_(language),
grammar_(grammar),
hardware_info_(hardware_info),
encoder_(new SpeexEncoder()),
endpointer_(kAudioSampleRate),
num_samples_recorded_(0),
audio_level_(0.0f) {
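  // Tune the endpointer: treat input as complete after half a second of
  // silence, or after a full second of silence once the utterance has run
  // longer than three seconds.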
endpointer_.set_speech_input_complete_silence_length(
base::Time::kMicrosecondsPerSecond / 2);
endpointer_.set_long_speech_input_complete_silence_length(
base::Time::kMicrosecondsPerSecond);
endpointer_.set_long_speech_length(3 * base::Time::kMicrosecondsPerSecond);
endpointer_.StartSession();
}
SpeechRecognizer::~SpeechRecognizer() {
// Recording should have stopped earlier due to the endpointer or
// |StopRecording| being called.
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
DCHECK(audio_buffers_.empty());
endpointer_.EndSession();
}
bool SpeechRecognizer::StartRecording() {
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
DCHECK(!audio_controller_.get());
DCHECK(!request_.get() || !request_->HasPendingRequest());
// The endpointer needs to estimate the environment/background noise before
// starting to treat the audio as user input. In |HandleOnData| we wait until
// such time has passed before switching to user input mode.
endpointer_.SetEnvironmentEstimationMode();
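  // 100 ms packets at 16 kHz give 1600 samples per packet, an exact multiple
  // of the encoder's 320-sample (20 ms) frames.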
int samples_per_packet = (kAudioSampleRate * kAudioPacketIntervalMs) / 1000;
DCHECK((samples_per_packet % encoder_->samples_per_frame()) == 0);
AudioParameters params(AudioParameters::AUDIO_PCM_LINEAR, kNumAudioChannels,
kAudioSampleRate, kNumBitsPerAudioSample,
samples_per_packet);
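  // AudioInputController captures on its own thread, delivering audio via
  // OnData() and errors via OnError(); see the handlers below.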
audio_controller_ = AudioInputController::Create(this, params);
DCHECK(audio_controller_.get());
VLOG(1) << "SpeechRecognizer starting record.";
num_samples_recorded_ = 0;
audio_controller_->Record();
return true;
}
void SpeechRecognizer::CancelRecognition() {
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
DCHECK(audio_controller_.get() || request_.get());
// Stop recording if required.
if (audio_controller_.get()) {
VLOG(1) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
}
VLOG(1) << "SpeechRecognizer canceling recognition.";
ReleaseAudioBuffers();
request_.reset();
}
void SpeechRecognizer::StopRecording() {
DCHECK(BrowserThread::CurrentlyOn(BrowserThread::IO));
// If audio recording has already stopped and we are in recognition phase,
// silently ignore any more calls to stop recording.
if (!audio_controller_.get())
return;
VLOG(1) << "SpeechRecognizer stopping record.";
audio_controller_->Close();
audio_controller_ = NULL; // Releases the ref ptr.
delegate_->DidCompleteRecording(caller_id_);
  // If we haven't received any audio yet, end the recognition sequence here.
if (audio_buffers_.empty()) {
// Guard against the delegate freeing us until we finish our job.
scoped_refptr<SpeechRecognizer> me(this);
delegate_->DidCompleteRecognition(caller_id_);
return;
}
// We now have recorded audio in our buffers, so start a recognition request.
  // Since the HTTP request takes a single string as POST data, allocate
// one and copy over bytes from the audio buffers to the string.
int audio_buffer_length = 0;
for (AudioBufferQueue::iterator it = audio_buffers_.begin();
       it != audio_buffers_.end(); ++it) {
audio_buffer_length += (*it)->length();
}
string data;
data.reserve(audio_buffer_length);
for (AudioBufferQueue::iterator it = audio_buffers_.begin();
       it != audio_buffers_.end(); ++it) {
data.append(*(*it));
}
DCHECK(!request_.get());
request_.reset(new SpeechRecognitionRequest(
Profile::GetDefaultRequestContext(), this));
request_->Send(language_, grammar_, hardware_info_, kContentTypeSpeex, data);
ReleaseAudioBuffers(); // No need to keep the audio anymore.
}
void SpeechRecognizer::ReleaseAudioBuffers() {
for (AudioBufferQueue::iterator it = audio_buffers_.begin();
       it != audio_buffers_.end(); ++it)
delete *it;
audio_buffers_.clear();
}
// Invoked in the audio thread.
void SpeechRecognizer::OnError(AudioInputController* controller,
int error_code) {
BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
NewRunnableMethod(this,
&SpeechRecognizer::HandleOnError,
error_code));
}
void SpeechRecognizer::HandleOnError(int error_code) {
LOG(WARNING) << "SpeechRecognizer::HandleOnError, code=" << error_code;
// Check if we are still recording before canceling recognition, as
// recording might have been stopped after this error was posted to the queue
// by |OnError|.
if (!audio_controller_.get())
return;
InformErrorAndCancelRecognition(RECOGNIZER_ERROR_CAPTURE);
}
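// Invoked in the audio thread.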
void SpeechRecognizer::OnData(AudioInputController* controller,
const uint8* data, uint32 size) {
  if (size == 0)  // This is normal; it can happen when recording stops.
return;
string* str_data = new string(reinterpret_cast<const char*>(data), size);
BrowserThread::PostTask(BrowserThread::IO, FROM_HERE,
NewRunnableMethod(this,
&SpeechRecognizer::HandleOnData,
str_data));
}
void SpeechRecognizer::HandleOnData(string* data) {
  // Check if we are still recording and, if not, discard this buffer;
  // recording might have been stopped after this buffer was posted to the
  // queue by |OnData|.
if (!audio_controller_.get()) {
delete data;
return;
}
const short* samples = reinterpret_cast<const short*>(data->data());
DCHECK((data->length() % sizeof(short)) == 0);
int num_samples = data->length() / sizeof(short);
encoder_->Encode(samples, num_samples, &audio_buffers_);
float rms;
endpointer_.ProcessAudio(samples, num_samples, &rms);
delete data;
num_samples_recorded_ += num_samples;
if (endpointer_.IsEstimatingEnvironment()) {
    // Check if we have gathered enough audio for the endpointer to finish
    // environment estimation and move on to detecting speech/end of speech.
if (num_samples_recorded_ >= (kEndpointerEstimationTimeMs *
kAudioSampleRate) / 1000) {
endpointer_.SetUserInputMode();
delegate_->DidCompleteEnvironmentEstimation(caller_id_);
}
return; // No more processing since we are still estimating environment.
}
// Check if we have waited too long without hearing any speech.
if (!endpointer_.DidStartReceivingSpeech() &&
num_samples_recorded_ >= kNoSpeechTimeoutSec * kAudioSampleRate) {
InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_SPEECH);
return;
}
// Calculate the input volume to display in the UI, smoothing towards the
// new level.
float level = (rms - kAudioMeterMinDb) / kAudioMeterDbRange;
level = std::min(std::max(0.0f, level), 1.0f);
if (level > audio_level_) {
audio_level_ += (level - audio_level_) * kUpSmoothingFactor;
} else {
audio_level_ += (level - audio_level_) * kDownSmoothingFactor;
}
delegate_->SetInputVolume(caller_id_, audio_level_);
if (endpointer_.speech_input_complete()) {
StopRecording();
}
// TODO(satish): Once we have streaming POST, start sending the data received
// here as POST chunks.
}
void SpeechRecognizer::SetRecognitionResult(
bool error, const SpeechInputResultArray& result) {
if (result.empty()) {
InformErrorAndCancelRecognition(RECOGNIZER_ERROR_NO_RESULTS);
return;
}
delegate_->SetRecognitionResult(caller_id_, error, result);
// Guard against the delegate freeing us until we finish our job.
scoped_refptr<SpeechRecognizer> me(this);
delegate_->DidCompleteRecognition(caller_id_);
}
void SpeechRecognizer::InformErrorAndCancelRecognition(ErrorCode error) {
CancelRecognition();
// Guard against the delegate freeing us until we finish our job.
scoped_refptr<SpeechRecognizer> me(this);
delegate_->OnRecognizerError(caller_id_, error);
}
} // namespace speech_input