blob: 42ef5d3b6f2887e45829e1d21436bc676fe90560 [file] [log] [blame]
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.inputmethod.research;
import android.util.Log;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Dictionary;
import com.android.inputmethod.latin.Suggest;
import com.android.inputmethod.latin.define.ProductionFlag;
import java.util.ArrayList;
import java.util.LinkedList;
/**
* MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
*
* There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
* be logged in enough detail to determine their contents, 2) only a subset of words are logged
* in detail, such as 10%, and 3) no numbers are logged.
*
* This class maintains a list of LogUnits, each corresponding to a word. As the user completes
* words, they are added here. But if the user backs up over their current word to edit a word
* entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
* the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled
* back out even after they are pushed in, we must not publish the contents of this LogBuffer too
* quickly. However, we cannot let the contents pile up either, or it will limit the editing that
* a user can perform.
*
* To balance these requirements (keep history so user can edit, flush history so it does not pile
* up), the LogBuffer is considered "complete" when the user has entered enough words to form an
* n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
* Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
* However, the additional non-detailed words are retained, in case the user backspaces to edit
* them. The MainLogBuffer then continues to add words, publishing individual non-detailed words
* as new words arrive. After enough non-detailed words have been pushed out to account for the
* 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
*
* If the words that would form the valid n-gram are not in the dictionary, then words are pushed
* through the LogBuffer one at a time until an n-gram is found that is entirely composed of
* dictionary words.
*
* If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
* n-gram containing dictionary words.
*/
public abstract class MainLogBuffer extends FixedLogBuffer {
private static final String TAG = MainLogBuffer.class.getSimpleName();
private static final boolean DEBUG = false
&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;
// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
public static final int N_GRAM_SIZE = 2;
// TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
// method.
private final Suggest mSuggest;
@UsedForTesting
private Dictionary mDictionaryForTesting;
private boolean mIsStopping = false;
/* package for test */ int mNumWordsBetweenNGrams;
// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
// after a sample is taken.
/* package for test */ int mNumWordsUntilSafeToSample;
public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
final Suggest suggest) {
super(N_GRAM_SIZE + wordsBetweenSamples);
mNumWordsBetweenNGrams = wordsBetweenSamples;
mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
mSuggest = suggest;
}
@UsedForTesting
/* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
mDictionaryForTesting = dictionary;
}
private Dictionary getDictionary() {
if (mDictionaryForTesting != null) {
return mDictionaryForTesting;
}
if (mSuggest == null || !mSuggest.hasMainDictionary()) return null;
return mSuggest.getMainDictionary();
}
public void setIsStopping() {
mIsStopping = true;
}
/**
* Determines whether uploading the n words at the front the MainLogBuffer will not violate
* user privacy.
*
* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
* non-character data that is typed between words. The decision about privacy is made based on
* the buffer's entire content. If it is decided that the privacy risks are too great to upload
* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
* the screen orientation and other characteristics about the device can be uploaded without
* revealing much about the user.
*/
private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
// Bypass privacy checks when debugging.
if (ResearchLogger.IS_LOGGING_EVERYTHING) {
if (mIsStopping) {
return true;
}
// Only check that it is the right length. If not, wait for later words to make
// complete n-grams.
int numWordsInLogUnitList = 0;
final int length = logUnits.size();
for (int i = 0; i < length; i++) {
final LogUnit logUnit = logUnits.get(i);
numWordsInLogUnitList += logUnit.getNumWords();
}
return numWordsInLogUnitList >= minNGramSize;
}
// Check that we are not sampling too frequently. Having sampled recently might disclose
// too much of the user's intended meaning.
if (mNumWordsUntilSafeToSample > 0) {
return false;
}
// Reload the dictionary in case it has changed (e.g., because the user has changed
// languages).
final Dictionary dictionary = getDictionary();
if (dictionary == null) {
// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
// contents to potentially pose a privacy risk.
return false;
}
// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
// the complete buffer contents in detail.
int numWordsInLogUnitList = 0;
final int length = logUnits.size();
for (final LogUnit logUnit : logUnits) {
if (!logUnit.hasOneOrMoreWords()) {
// Digits outside words are a privacy threat.
if (logUnit.mayContainDigit()) {
return false;
}
} else {
numWordsInLogUnitList += logUnit.getNumWords();
final String[] words = logUnit.getWordsAsStringArray();
for (final String word : words) {
// Words not in the dictionary are a privacy threat.
if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
if (DEBUG) {
Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
+ ResearchLogger.hasLetters(word)
+ ", isValid: " + (dictionary.isValidWord(word)));
}
return false;
}
}
}
}
// Finally, only return true if the ngram is the right size.
return numWordsInLogUnitList == minNGramSize;
}
public void shiftAndPublishAll() {
final LinkedList<LogUnit> logUnits = getLogUnits();
while (!logUnits.isEmpty()) {
publishLogUnitsAtFrontOfBuffer();
}
}
@Override
protected final void onBufferFull() {
publishLogUnitsAtFrontOfBuffer();
}
protected final void publishLogUnitsAtFrontOfBuffer() {
ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
// Good n-gram at the front of the buffer. Publish it, disclosing details.
publish(logUnits, true /* canIncludePrivateData */);
shiftOutWords(N_GRAM_SIZE);
mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
} else {
// No good n-gram at front, and buffer is full. Shift out up through the first logUnit
// with associated words (or if there is none, all the existing logUnits).
logUnits.clear();
for (LogUnit logUnit = shiftOut(); logUnit != null && !logUnit.hasOneOrMoreWords();
logUnit = shiftOut()) {
logUnits.add(logUnit);
}
publish(logUnits, false /* canIncludePrivateData */);
}
}
/**
* Called when a list of logUnits should be published.
*
* It is the subclass's responsibility to implement the publication.
*
* @param logUnits The list of logUnits to be published.
* @param canIncludePrivateData Whether the private data in the logUnits can be included in
* publication.
*/
protected abstract void publish(final ArrayList<LogUnit> logUnits,
final boolean canIncludePrivateData);
@Override
protected int shiftOutWords(final int numWords) {
final int numWordContainingLogUnitsShiftedOut = super.shiftOutWords(numWords);
mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample
- numWordContainingLogUnitsShiftedOut);
if (DEBUG) {
Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
}
return numWordContainingLogUnitsShiftedOut;
}
}