java/src/com/android/inputmethod/research/MainLogBuffer.java - platform/packages/inputmethods/LatinIME - Git at Google

 /*
  * Copyright (C) 2012 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package com.android.inputmethod.research;

 import android.util.Log;

 import com.android.inputmethod.annotations.UsedForTesting;
 import com.android.inputmethod.latin.Dictionary;
 import com.android.inputmethod.latin.Suggest;
 import com.android.inputmethod.latin.define.ProductionFlag;

 import java.util.ArrayList;
 import java.util.LinkedList;

 /**
  * MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
  *
  * There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
  * be logged in enough detail to determine their contents, 2) only a subset of words are logged
  * in detail, such as 10%, and 3) no numbers are logged.
  *
  * This class maintains a list of LogUnits, each corresponding to a word.  As the user completes
  * words, they are added here.  But if the user backs up over their current word to edit a word
  * entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
  * the LogUnit, and it is pushed back in here when the user is done.  Because words may be pulled
  * back out even after they are pushed in, we must not publish the contents of this LogBuffer too
  * quickly.  However, we cannot let the contents pile up either, or it will limit the editing that
  * a user can perform.
  *
  * To balance these requirements (keep history so user can edit, flush history so it does not pile
  * up), the LogBuffer is considered "complete" when the user has entered enough words to form an
  * n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
  * Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
  * However, the additional non-detailed words are retained, in case the user backspaces to edit
  * them.  The MainLogBuffer then continues to add words, publishing individual non-detailed words
  * as new words arrive.  After enough non-detailed words have been pushed out to account for the
  * 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
  *
  * If the words that would form the valid n-gram are not in the dictionary, then words are pushed
  * through the LogBuffer one at a time until an n-gram is found that is entirely composed of
  * dictionary words.
  *
  * If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
  * n-gram containing dictionary words.
  */
 public abstract class MainLogBuffer extends FixedLogBuffer {
     private static final String TAG = MainLogBuffer.class.getSimpleName();
     private static final boolean DEBUG = false
             && ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;

     // The size of the n-grams logged.  E.g. N_GRAM_SIZE = 2 means to sample bigrams.
     public static final int N_GRAM_SIZE = 2;

     // TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
     // method.
     private final Suggest mSuggest;
     // When non-null, getDictionary() returns this instead of asking mSuggest.
     @UsedForTesting
     private Dictionary mDictionaryForTesting;
     // Set by setIsStopping(); only consulted when ResearchLogger.IS_LOGGING_EVERYTHING is on.
     private boolean mIsStopping = false;

     // Value that mNumWordsUntilSafeToSample is reset to after an n-gram has been sampled.
     /* package for test */ int mNumWordsBetweenNGrams;

     // Counter for words left to suppress before an n-gram can be sampled.  Reset to
     // mNumWordsBetweenNGrams after a sample is taken.
     /* package for test */ int mNumWordsUntilSafeToSample;

     /**
      * Creates a MainLogBuffer.
      *
      * @param wordsBetweenSamples The number of words to suppress between two sampled n-grams.
      * Together with N_GRAM_SIZE it determines the size passed to the FixedLogBuffer constructor.
      * @param numInitialWordsToIgnore The initial value of the suppression counter (ignored, and
      * treated as 0, when DEBUG is on).
      * @param suggest The source of the main dictionary; may be null, in which case no dictionary
      * is available unless one is set for testing.
      */
     public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
             final Suggest suggest) {
         super(N_GRAM_SIZE + wordsBetweenSamples);
         mNumWordsBetweenNGrams = wordsBetweenSamples;
         mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
         mSuggest = suggest;
     }

     /**
      * Installs a dictionary that takes precedence over the one provided by Suggest.
      *
      * @param dictionary The dictionary to use for validity checks; null restores the default.
      */
     @UsedForTesting
     /* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
         mDictionaryForTesting = dictionary;
     }

     /**
      * Returns the dictionary used to decide whether a word may be logged in detail.
      *
      * @return The testing dictionary if one was set, otherwise the main dictionary from Suggest,
      * or null if neither is available.
      */
     private Dictionary getDictionary() {
         if (mDictionaryForTesting != null) {
             return mDictionaryForTesting;
         }
         if (mSuggest == null || !mSuggest.hasMainDictionary()) {
             return null;
         }
         return mSuggest.getMainDictionary();
     }

     /**
      * Marks this buffer as stopping.  While ResearchLogger.IS_LOGGING_EVERYTHING is on, a stopping
      * buffer treats every candidate n-gram as safe, regardless of its length.
      */
     public void setIsStopping() {
         mIsStopping = true;
     }

     /**
      * Determines whether uploading the n words at the front of the MainLogBuffer will not violate
      * user privacy.
      *
      * The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
      * non-character data that is typed between words.  The decision about privacy is made based on
      * the buffer's entire content.  If it is decided that the privacy risks are too great to upload
      * the contents of this buffer, a censored version of the LogItems may still be uploaded.  E.g.,
      * the screen orientation and other characteristics about the device can be uploaded without
      * revealing much about the user.
      *
      * @param logUnits The candidate LogUnits, in buffer order.
      * @param minNGramSize The number of words the candidate must contain.  In the normal path the
      * word count must match exactly; in the log-everything path it is a lower bound.
      * @return Whether the candidate may be published with private data included.
      */
     private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
         // Bypass privacy checks when debugging.
         if (ResearchLogger.IS_LOGGING_EVERYTHING) {
             if (mIsStopping) {
                 return true;
             }
             // Only check that it is the right length.  If not, wait for later words to make
             // complete n-grams.
             int numWordsInLogUnitList = 0;
             for (final LogUnit logUnit : logUnits) {
                 numWordsInLogUnitList += logUnit.getNumWords();
             }
             return numWordsInLogUnitList >= minNGramSize;
         }

         // Check that we are not sampling too frequently.  Having sampled recently might disclose
         // too much of the user's intended meaning.
         if (mNumWordsUntilSafeToSample > 0) {
             return false;
         }
         // Reload the dictionary in case it has changed (e.g., because the user has changed
         // languages).
         final Dictionary dictionary = getDictionary();
         if (dictionary == null) {
             // Main dictionary is unavailable.  Since we cannot check it, we cannot tell if a
             // word is out-of-vocabulary or not.  Therefore, we must judge the entire buffer
             // contents to potentially pose a privacy risk.
             return false;
         }

         // Check each word in the buffer.  If any word poses a privacy threat, we cannot upload
         // the complete buffer contents in detail.
         int numWordsInLogUnitList = 0;
         for (final LogUnit logUnit : logUnits) {
             if (!logUnit.hasOneOrMoreWords()) {
                 // Digits outside words are a privacy threat.
                 if (logUnit.mayContainDigit()) {
                     return false;
                 }
             } else {
                 numWordsInLogUnitList += logUnit.getNumWords();
                 final String[] words = logUnit.getWordsAsStringArray();
                 for (final String word : words) {
                     // Words not in the dictionary are a privacy threat.
                     if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
                         if (DEBUG) {
                             Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
                                     + ResearchLogger.hasLetters(word)
                                     + ", isValid: " + (dictionary.isValidWord(word)));
                         }
                         return false;
                     }
                 }
             }
         }

         // Finally, only return true if the ngram is the right size.
         return numWordsInLogUnitList == minNGramSize;
     }

     /**
      * Publishes from the front of the buffer repeatedly until no LogUnits remain.
      */
     public void shiftAndPublishAll() {
         // The loop relies on this list shrinking as units are shifted out, i.e. on getLogUnits()
         // returning the live backing list rather than a copy -- TODO confirm in FixedLogBuffer.
         final LinkedList<LogUnit> logUnits = getLogUnits();
         while (!logUnits.isEmpty()) {
             publishLogUnitsAtFrontOfBuffer();
         }
     }

     @Override
     protected final void onBufferFull() {
         publishLogUnitsAtFrontOfBuffer();
     }

     /**
      * Publishes and removes LogUnits from the front of the buffer.
      *
      * If the first N_GRAM_SIZE words form a safe n-gram, they are published with private data and
      * shifted out, and the suppression counter is reset to mNumWordsBetweenNGrams.  Otherwise the
      * LogUnits up to and including the first one that contains words (or all LogUnits, if none
      * contains words) are shifted out and published without private data, and the suppression
      * counter is reduced by the number of words that left the buffer.
      */
     protected final void publishLogUnitsAtFrontOfBuffer() {
         final ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
         if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
             // Good n-gram at the front of the buffer.  Publish it, disclosing details.
             publish(logUnits, true /* canIncludePrivateData */);
             shiftOutWords(N_GRAM_SIZE);
             mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
             return;
         }
         // No good n-gram at front.  Shift out up through the first logUnit with associated words
         // (or if there is none, all the existing logUnits).
         logUnits.clear();
         for (LogUnit logUnit = shiftOut(); logUnit != null; logUnit = shiftOut()) {
             // Every unit removed from the buffer is published (in censored form); otherwise the
             // word-bearing unit that terminates this loop would be silently discarded.
             logUnits.add(logUnit);
             if (logUnit.hasOneOrMoreWords()) {
                 // These words left the buffer without being sampled, so they count toward the
                 // words that must be suppressed before the next sample.  shiftOut() bypasses the
                 // accounting done in shiftOutWords(), so it has to be done here.
                 mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample
                         - logUnit.getNumWords());
                 if (DEBUG) {
                     Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
                 }
                 break;
             }
         }
         publish(logUnits, false /* canIncludePrivateData */);
     }

     /**
      * Called when a list of logUnits should be published.
      *
      * It is the subclass's responsibility to implement the publication.
      *
      * @param logUnits The list of logUnits to be published.
      * @param canIncludePrivateData Whether the private data in the logUnits can be included in
      * publication.
      */
     protected abstract void publish(final ArrayList<LogUnit> logUnits,
             final boolean canIncludePrivateData);

     /**
      * Shifts words out via the superclass and reduces the suppression counter by the value the
      * superclass reports, never letting the counter drop below zero.
      *
      * @param numWords The number of words to shift out.
      * @return The value returned by the superclass implementation.
      */
     @Override
     protected int shiftOutWords(final int numWords) {
         final int numWordContainingLogUnitsShiftedOut = super.shiftOutWords(numWords);
         mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample
                 - numWordContainingLogUnitsShiftedOut);
         if (DEBUG) {
             Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
         }
         return numWordContainingLogUnitsShiftedOut;
     }
 }
	/*
	* Copyright (C) 2012 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package com.android.inputmethod.research;

	import android.util.Log;

	import com.android.inputmethod.annotations.UsedForTesting;
	import com.android.inputmethod.latin.Dictionary;
	import com.android.inputmethod.latin.Suggest;
	import com.android.inputmethod.latin.define.ProductionFlag;

	import java.util.ArrayList;
	import java.util.LinkedList;

	/**
	* MainLogBuffer is a FixedLogBuffer that tracks the state of LogUnits to make privacy guarantees.
	*
	* There are three forms of privacy protection: 1) only words in the main dictionary are allowed to
	* be logged in enough detail to determine their contents, 2) only a subset of words are logged
	* in detail, such as 10%, and 3) no numbers are logged.
	*
	* This class maintains a list of LogUnits, each corresponding to a word. As the user completes
	* words, they are added here. But if the user backs up over their current word to edit a word
	* entered earlier, then it is pulled out of this LogBuffer, changes are then added to the end of
	* the LogUnit, and it is pushed back in here when the user is done. Because words may be pulled
	* back out even after they are pushed in, we must not publish the contents of this LogBuffer too
	* quickly. However, we cannot let the contents pile up either, or it will limit the editing that
	* a user can perform.
	*
	* To balance these requirements (keep history so user can edit, flush history so it does not pile
	* up), the LogBuffer is considered "complete" when the user has entered enough words to form an
	* n-gram, followed by enough additional non-detailed words (that are in the 90%, as per above).
	* Once complete, the n-gram may be published to flash storage (via the ResearchLog class).
	* However, the additional non-detailed words are retained, in case the user backspaces to edit
	* them. The MainLogBuffer then continues to add words, publishing individual non-detailed words
	* as new words arrive. After enough non-detailed words have been pushed out to account for the
	* 90% between words, the words at the front of the LogBuffer can be published as an n-gram again.
	*
	* If the words that would form the valid n-gram are not in the dictionary, then words are pushed
	* through the LogBuffer one at a time until an n-gram is found that is entirely composed of
	* dictionary words.
	*
	* If the user closes a session, then the entire LogBuffer is flushed, publishing any embedded
	* n-gram containing dictionary words.
	*/
	public abstract class MainLogBuffer extends FixedLogBuffer {
	private static final String TAG = MainLogBuffer.class.getSimpleName();
	private static final boolean DEBUG = false
	&& ProductionFlag.USES_DEVELOPMENT_ONLY_DIAGNOSTICS_DEBUG;

	// The size of the n-grams logged. E.g. N_GRAM_SIZE = 2 means to sample bigrams.
	public static final int N_GRAM_SIZE = 2;

	// TODO: Remove dependence on Suggest, and pass in Dictionary as a parameter to an appropriate
	// method.
	private final Suggest mSuggest;
	@UsedForTesting
	private Dictionary mDictionaryForTesting;
	private boolean mIsStopping = false;

	/* package for test */ int mNumWordsBetweenNGrams;

	// Counter for words left to suppress before an n-gram can be sampled. Reset to mMinWordPeriod
	// after a sample is taken.
	/* package for test */ int mNumWordsUntilSafeToSample;

	public MainLogBuffer(final int wordsBetweenSamples, final int numInitialWordsToIgnore,
	final Suggest suggest) {
	super(N_GRAM_SIZE + wordsBetweenSamples);
	mNumWordsBetweenNGrams = wordsBetweenSamples;
	mNumWordsUntilSafeToSample = DEBUG ? 0 : numInitialWordsToIgnore;
	mSuggest = suggest;
	}

	@UsedForTesting
	/* package for test */ void setDictionaryForTesting(final Dictionary dictionary) {
	mDictionaryForTesting = dictionary;
	}

	private Dictionary getDictionary() {
	if (mDictionaryForTesting != null) {
	return mDictionaryForTesting;
	}
	if (mSuggest == null \|\| !mSuggest.hasMainDictionary()) return null;
	return mSuggest.getMainDictionary();
	}

	public void setIsStopping() {
	mIsStopping = true;
	}

	/**
	* Determines whether uploading the n words at the front the MainLogBuffer will not violate
	* user privacy.
	*
	* The size of the MainLogBuffer is just enough to hold one n-gram, its corrections, and any
	* non-character data that is typed between words. The decision about privacy is made based on
	* the buffer's entire content. If it is decided that the privacy risks are too great to upload
	* the contents of this buffer, a censored version of the LogItems may still be uploaded. E.g.,
	* the screen orientation and other characteristics about the device can be uploaded without
	* revealing much about the user.
	*/
	private boolean isSafeNGram(final ArrayList<LogUnit> logUnits, final int minNGramSize) {
	// Bypass privacy checks when debugging.
	if (ResearchLogger.IS_LOGGING_EVERYTHING) {
	if (mIsStopping) {
	return true;
	}
	// Only check that it is the right length. If not, wait for later words to make
	// complete n-grams.
	int numWordsInLogUnitList = 0;
	final int length = logUnits.size();
	for (int i = 0; i < length; i++) {
	final LogUnit logUnit = logUnits.get(i);
	numWordsInLogUnitList += logUnit.getNumWords();
	}
	return numWordsInLogUnitList >= minNGramSize;
	}

	// Check that we are not sampling too frequently. Having sampled recently might disclose
	// too much of the user's intended meaning.
	if (mNumWordsUntilSafeToSample > 0) {
	return false;
	}
	// Reload the dictionary in case it has changed (e.g., because the user has changed
	// languages).
	final Dictionary dictionary = getDictionary();
	if (dictionary == null) {
	// Main dictionary is unavailable. Since we cannot check it, we cannot tell if a
	// word is out-of-vocabulary or not. Therefore, we must judge the entire buffer
	// contents to potentially pose a privacy risk.
	return false;
	}

	// Check each word in the buffer. If any word poses a privacy threat, we cannot upload
	// the complete buffer contents in detail.
	int numWordsInLogUnitList = 0;
	final int length = logUnits.size();
	for (final LogUnit logUnit : logUnits) {
	if (!logUnit.hasOneOrMoreWords()) {
	// Digits outside words are a privacy threat.
	if (logUnit.mayContainDigit()) {
	return false;
	}
	} else {
	numWordsInLogUnitList += logUnit.getNumWords();
	final String[] words = logUnit.getWordsAsStringArray();
	for (final String word : words) {
	// Words not in the dictionary are a privacy threat.
	if (ResearchLogger.hasLetters(word) && !(dictionary.isValidWord(word))) {
	if (DEBUG) {
	Log.d(TAG, "\"" + word + "\" NOT SAFE!: hasLetters: "
	+ ResearchLogger.hasLetters(word)
	+ ", isValid: " + (dictionary.isValidWord(word)));
	}
	return false;
	}
	}
	}
	}

	// Finally, only return true if the ngram is the right size.
	return numWordsInLogUnitList == minNGramSize;
	}

	public void shiftAndPublishAll() {
	final LinkedList<LogUnit> logUnits = getLogUnits();
	while (!logUnits.isEmpty()) {
	publishLogUnitsAtFrontOfBuffer();
	}
	}

	@Override
	protected final void onBufferFull() {
	publishLogUnitsAtFrontOfBuffer();
	}

	protected final void publishLogUnitsAtFrontOfBuffer() {
	ArrayList<LogUnit> logUnits = peekAtFirstNWords(N_GRAM_SIZE);
	if (isSafeNGram(logUnits, N_GRAM_SIZE)) {
	// Good n-gram at the front of the buffer. Publish it, disclosing details.
	publish(logUnits, true /* canIncludePrivateData */);
	shiftOutWords(N_GRAM_SIZE);
	mNumWordsUntilSafeToSample = mNumWordsBetweenNGrams;
	} else {
	// No good n-gram at front, and buffer is full. Shift out up through the first logUnit
	// with associated words (or if there is none, all the existing logUnits).
	logUnits.clear();
	for (LogUnit logUnit = shiftOut(); logUnit != null && !logUnit.hasOneOrMoreWords();
	logUnit = shiftOut()) {
	logUnits.add(logUnit);
	}
	publish(logUnits, false /* canIncludePrivateData */);
	}
	}

	/**
	* Called when a list of logUnits should be published.
	*
	* It is the subclass's responsibility to implement the publication.
	*
	* @param logUnits The list of logUnits to be published.
	* @param canIncludePrivateData Whether the private data in the logUnits can be included in
	* publication.
	*/
	protected abstract void publish(final ArrayList<LogUnit> logUnits,
	final boolean canIncludePrivateData);

	@Override
	protected int shiftOutWords(final int numWords) {
	final int numWordContainingLogUnitsShiftedOut = super.shiftOutWords(numWords);
	mNumWordsUntilSafeToSample = Math.max(0, mNumWordsUntilSafeToSample
	- numWordContainingLogUnitsShiftedOut);
	if (DEBUG) {
	Log.d(TAG, "wordsUntilSafeToSample now at " + mNumWordsUntilSafeToSample);
	}
	return numWordContainingLogUnitsShiftedOut;
	}
	}