blob: 3b0f7079b72850bdfc9db10939ea0478ff7ec174 [file] [log] [blame]
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_
#define CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_
#include <list>
#include <string>
#include <utility>
#include "base/ref_counted.h"
#include "base/scoped_ptr.h"
#include "chrome/browser/speech/endpointer/endpointer.h"
#include "chrome/browser/speech/speech_recognition_request.h"
#include "media/audio/audio_input_controller.h"
namespace speech_input {
class SpeexEncoder;
// Records audio, sends recorded audio to server and translates server response
// to recognition result.
class SpeechRecognizer
: public base::RefCountedThreadSafe<SpeechRecognizer>,
public media::AudioInputController::EventHandler,
public SpeechRecognitionRequestDelegate {
public:
enum ErrorCode {
RECOGNIZER_NO_ERROR,
RECOGNIZER_ERROR_CAPTURE,
RECOGNIZER_ERROR_NO_SPEECH,
RECOGNIZER_ERROR_NO_RESULTS,
};
// Implemented by the caller to receive recognition events.
class Delegate {
public:
virtual void SetRecognitionResult(
int caller_id,
bool error,
const SpeechInputResultArray& result) = 0;
// Invoked when audio recording stops, either due to the end pointer
// detecting silence in user input or if |StopRecording| was called. The
// delegate has to wait until |DidCompleteRecognition| is invoked before
// destroying the |SpeechRecognizer| object.
virtual void DidCompleteRecording(int caller_id) = 0;
// This is guaranteed to be the last method invoked in the recognition
// sequence and the |SpeechRecognizer| object can be freed up if necessary.
virtual void DidCompleteRecognition(int caller_id) = 0;
// Invoked if there was an error while recording or recognizing audio. The
// session has already been cancelled when this call is made and the DidXxxx
// callbacks will not be issued. It is safe to destroy/release the
// |SpeechRecognizer| object while processing this call.
virtual void OnRecognizerError(int caller_id,
SpeechRecognizer::ErrorCode error) = 0;
// At the start of recognition, a short amount of audio is recorded to
// estimate the environment/background noise and this callback is issued
// after that is complete. Typically the delegate brings up any speech
// recognition UI once this callback is received.
virtual void DidCompleteEnvironmentEstimation(int caller_id) = 0;
// Informs of a change in the captured audio level, useful if displaying
// a microphone volume indicator while recording.
// The value of |volume| is in the [0.0, 1.0] range.
virtual void SetInputVolume(int caller_id, float volume) = 0;
protected:
virtual ~Delegate() {}
};
SpeechRecognizer(Delegate* delegate,
int caller_id,
const std::string& language,
const std::string& grammar,
const std::string& hardware_info);
~SpeechRecognizer();
// Starts audio recording and does recognition after recording ends. The same
// SpeechRecognizer instance can be used multiple times for speech recognition
// though each recognition request can be made only after the previous one
// completes (i.e. after receiving Delegate::DidCompleteRecognition).
bool StartRecording();
// Stops recording audio and starts recognition.
void StopRecording();
// Stops recording audio and cancels recognition. Any audio recorded so far
// gets discarded.
void CancelRecognition();
// AudioInputController::EventHandler methods.
void OnCreated(media::AudioInputController* controller) { }
void OnRecording(media::AudioInputController* controller) { }
void OnError(media::AudioInputController* controller, int error_code);
void OnData(media::AudioInputController* controller, const uint8* data,
uint32 size);
// SpeechRecognitionRequest::Delegate methods.
void SetRecognitionResult(bool error, const SpeechInputResultArray& result);
static const int kAudioSampleRate;
static const int kAudioPacketIntervalMs; // Duration of each audio packet.
static const int kNumAudioChannels;
static const int kNumBitsPerAudioSample;
static const int kNoSpeechTimeoutSec;
static const int kEndpointerEstimationTimeMs;
private:
void ReleaseAudioBuffers();
void InformErrorAndCancelRecognition(ErrorCode error);
void SendRecordedAudioToServer();
void HandleOnError(int error_code); // Handles OnError in the IO thread.
// Handles OnData in the IO thread. Takes ownership of |data|.
void HandleOnData(std::string* data);
Delegate* delegate_;
int caller_id_;
std::string language_;
std::string grammar_;
std::string hardware_info_;
// Buffer holding the recorded audio. Owns the strings inside the list.
typedef std::list<std::string*> AudioBufferQueue;
AudioBufferQueue audio_buffers_;
scoped_ptr<SpeechRecognitionRequest> request_;
scoped_refptr<media::AudioInputController> audio_controller_;
scoped_ptr<SpeexEncoder> encoder_;
Endpointer endpointer_;
int num_samples_recorded_;
float audio_level_;
DISALLOW_COPY_AND_ASSIGN(SpeechRecognizer);
};
// This typedef is to workaround the issue with certain versions of
// Visual Studio where it gets confused between multiple Delegate
// classes and gives a C2500 error. (I saw this error on the try bots -
// the workaround was not needed for my machine).
typedef SpeechRecognizer::Delegate SpeechRecognizerDelegate;
} // namespace speech_input
#endif // CHROME_BROWSER_SPEECH_SPEECH_RECOGNIZER_H_