blob: c87a9254dee1197b3b9cff1ed3ee30d30efe9d75 [file] [log] [blame]
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.android.inputmethod.latin.makedict;
import com.android.inputmethod.annotations.UsedForTesting;
import com.android.inputmethod.latin.Constants;
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.CharEncoding;
import com.android.inputmethod.latin.makedict.BinaryDictInputOutput.FusionDictionaryBufferInterface;
import com.android.inputmethod.latin.makedict.FormatSpec.FileHeader;
import com.android.inputmethod.latin.makedict.FormatSpec.FormatOptions;
import com.android.inputmethod.latin.makedict.FusionDictionary.CharGroup;
import com.android.inputmethod.latin.makedict.FusionDictionary.WeightedString;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.Stack;
public final class BinaryDictIOUtils {
private static final boolean DBG = false;
private static final int MSB24 = 0x800000;
private static final int SINT24_MAX = 0x7FFFFF;
private static final int MAX_JUMPS = 10000;
private BinaryDictIOUtils() {
// This utility class is not publicly instantiable.
}
private static final class Position {
public static final int NOT_READ_GROUPCOUNT = -1;
public int mAddress;
public int mNumOfCharGroup;
public int mPosition;
public int mLength;
public Position(int address, int length) {
mAddress = address;
mLength = length;
mNumOfCharGroup = NOT_READ_GROUPCOUNT;
}
}
/**
* Tours all node without recursive call.
*/
private static void readUnigramsAndBigramsBinaryInner(
final FusionDictionaryBufferInterface buffer, final int headerSize,
final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
final Map<Integer, ArrayList<PendingAttribute>> bigrams,
final FormatOptions formatOptions) {
int[] pushedChars = new int[FormatSpec.MAX_WORD_LENGTH + 1];
Stack<Position> stack = new Stack<Position>();
int index = 0;
Position initPos = new Position(headerSize, 0);
stack.push(initPos);
while (!stack.empty()) {
Position p = stack.peek();
if (DBG) {
MakedictLog.d("read: address=" + p.mAddress + ", numOfCharGroup=" +
p.mNumOfCharGroup + ", position=" + p.mPosition + ", length=" + p.mLength);
}
if (buffer.position() != p.mAddress) buffer.position(p.mAddress);
if (index != p.mLength) index = p.mLength;
if (p.mNumOfCharGroup == Position.NOT_READ_GROUPCOUNT) {
p.mNumOfCharGroup = BinaryDictInputOutput.readCharGroupCount(buffer);
p.mAddress += BinaryDictInputOutput.getGroupCountSize(p.mNumOfCharGroup);
p.mPosition = 0;
}
if (p.mNumOfCharGroup == 0) {
stack.pop();
continue;
}
CharGroupInfo info = BinaryDictInputOutput.readCharGroup(buffer,
p.mAddress - headerSize, formatOptions);
for (int i = 0; i < info.mCharacters.length; ++i) {
pushedChars[index++] = info.mCharacters[i];
}
p.mPosition++;
final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(info.mFlags,
formatOptions);
final boolean isDeletedGroup = BinaryDictInputOutput.isDeletedGroup(info.mFlags,
formatOptions);
if (!isMovedGroup && !isDeletedGroup
&& info.mFrequency != FusionDictionary.CharGroup.NOT_A_TERMINAL) {// found word
words.put(info.mOriginalAddress, new String(pushedChars, 0, index));
frequencies.put(info.mOriginalAddress, info.mFrequency);
if (info.mBigrams != null) bigrams.put(info.mOriginalAddress, info.mBigrams);
}
if (p.mPosition == p.mNumOfCharGroup) {
if (formatOptions.mSupportsDynamicUpdate) {
final int forwardLinkAddress = buffer.readUnsignedInt24();
if (forwardLinkAddress != FormatSpec.NO_FORWARD_LINK_ADDRESS) {
// the node has a forward link.
p.mNumOfCharGroup = Position.NOT_READ_GROUPCOUNT;
p.mAddress = forwardLinkAddress;
} else {
stack.pop();
}
} else {
stack.pop();
}
} else {
// the node has more groups.
p.mAddress = buffer.position();
}
if (!isMovedGroup && BinaryDictInputOutput.hasChildrenAddress(info.mChildrenAddress)) {
Position childrenPos = new Position(info.mChildrenAddress + headerSize, index);
stack.push(childrenPos);
}
}
}
/**
* Reads unigrams and bigrams from the binary file.
* Doesn't make the memory representation of the dictionary.
*
* @param buffer the buffer to read.
* @param words the map to store the address as a key and the word as a value.
* @param frequencies the map to store the address as a key and the frequency as a value.
* @param bigrams the map to store the address as a key and the list of address as a value.
* @throws IOException
* @throws UnsupportedFormatException
*/
public static void readUnigramsAndBigramsBinary(final FusionDictionaryBufferInterface buffer,
final Map<Integer, String> words, final Map<Integer, Integer> frequencies,
final Map<Integer, ArrayList<PendingAttribute>> bigrams) throws IOException,
UnsupportedFormatException {
// Read header
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
readUnigramsAndBigramsBinaryInner(buffer, header.mHeaderSize, words, frequencies, bigrams,
header.mFormatOptions);
}
/**
* Gets the address of the last CharGroup of the exact matching word in the dictionary.
* If no match is found, returns NOT_VALID_WORD.
*
* @param buffer the buffer to read.
* @param word the word we search for.
* @return the address of the terminal node.
* @throws IOException
* @throws UnsupportedFormatException
*/
@UsedForTesting
public static int getTerminalPosition(final FusionDictionaryBufferInterface buffer,
final String word) throws IOException, UnsupportedFormatException {
if (word == null) return FormatSpec.NOT_VALID_WORD;
if (buffer.position() != 0) buffer.position(0);
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
int wordPos = 0;
final int wordLen = word.codePointCount(0, word.length());
for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
if (wordPos >= wordLen) return FormatSpec.NOT_VALID_WORD;
do {
final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
boolean foundNextCharGroup = false;
for (int i = 0; i < charGroupCount; ++i) {
final int charGroupPos = buffer.position();
final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
buffer.position(), header.mFormatOptions);
final boolean isMovedGroup =
BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
header.mFormatOptions);
final boolean isDeletedGroup =
BinaryDictInputOutput.isDeletedGroup(currentInfo.mFlags,
header.mFormatOptions);
if (isMovedGroup) continue;
boolean same = true;
for (int p = 0, j = word.offsetByCodePoints(0, wordPos);
p < currentInfo.mCharacters.length;
++p, j = word.offsetByCodePoints(j, 1)) {
if (wordPos + p >= wordLen
|| word.codePointAt(j) != currentInfo.mCharacters[p]) {
same = false;
break;
}
}
if (same) {
// found the group matches the word.
if (wordPos + currentInfo.mCharacters.length == wordLen) {
if (currentInfo.mFrequency == CharGroup.NOT_A_TERMINAL
|| isDeletedGroup) {
return FormatSpec.NOT_VALID_WORD;
} else {
return charGroupPos;
}
}
wordPos += currentInfo.mCharacters.length;
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
return FormatSpec.NOT_VALID_WORD;
}
foundNextCharGroup = true;
buffer.position(currentInfo.mChildrenAddress);
break;
}
}
// If we found the next char group, it is under the file pointer.
// But if not, we are at the end of this node so we expect to have
// a forward link address that we need to consult and possibly resume
// search on the next node in the linked list.
if (foundNextCharGroup) break;
if (!header.mFormatOptions.mSupportsDynamicUpdate) {
return FormatSpec.NOT_VALID_WORD;
}
final int forwardLinkAddress = buffer.readUnsignedInt24();
if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
return FormatSpec.NOT_VALID_WORD;
}
buffer.position(forwardLinkAddress);
} while(true);
}
return FormatSpec.NOT_VALID_WORD;
}
private static int markAsDeleted(final int flags) {
return (flags & (~FormatSpec.MASK_GROUP_ADDRESS_TYPE)) | FormatSpec.FLAG_IS_DELETED;
}
/**
* Delete the word from the binary file.
*
* @param buffer the buffer to write.
* @param word the word we delete
* @throws IOException
* @throws UnsupportedFormatException
*/
@UsedForTesting
public static void deleteWord(final FusionDictionaryBufferInterface buffer,
final String word) throws IOException, UnsupportedFormatException {
buffer.position(0);
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
final int wordPosition = getTerminalPosition(buffer, word);
if (wordPosition == FormatSpec.NOT_VALID_WORD) return;
buffer.position(wordPosition);
final int flags = buffer.readUnsignedByte();
buffer.position(wordPosition);
buffer.put((byte)markAsDeleted(flags));
}
/**
* @return the size written, in bytes. Always 3 bytes.
*/
private static int writeSInt24ToBuffer(final FusionDictionaryBufferInterface buffer,
final int value) {
final int absValue = Math.abs(value);
buffer.put((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
buffer.put((byte)((absValue >> 8) & 0xFF));
buffer.put((byte)(absValue & 0xFF));
return 3;
}
/**
* @return the size written, in bytes. Always 3 bytes.
*/
private static int writeSInt24ToStream(final OutputStream destination, final int value)
throws IOException {
final int absValue = Math.abs(value);
destination.write((byte)(((value < 0 ? 0x80 : 0) | (absValue >> 16)) & 0xFF));
destination.write((byte)((absValue >> 8) & 0xFF));
destination.write((byte)(absValue & 0xFF));
return 3;
}
/**
* @return the size written, in bytes. 1, 2, or 3 bytes.
*/
private static int writeVariableAddress(final OutputStream destination, final int value)
throws IOException {
switch (BinaryDictInputOutput.getByteSize(value)) {
case 1:
destination.write((byte)value);
break;
case 2:
destination.write((byte)(0xFF & (value >> 8)));
destination.write((byte)(0xFF & value));
break;
case 3:
destination.write((byte)(0xFF & (value >> 16)));
destination.write((byte)(0xFF & (value >> 8)));
destination.write((byte)(0xFF & value));
break;
}
return BinaryDictInputOutput.getByteSize(value);
}
/**
* Update a parent address in a CharGroup that is referred to by groupOriginAddress.
*
* @param buffer the buffer to write.
* @param groupOriginAddress the address of the group.
* @param newParentAddress the absolute address of the parent.
* @param formatOptions file format options.
*/
public static void updateParentAddress(final FusionDictionaryBufferInterface buffer,
final int groupOriginAddress, final int newParentAddress,
final FormatOptions formatOptions) {
final int originalPosition = buffer.position();
buffer.position(groupOriginAddress);
if (!formatOptions.mSupportsDynamicUpdate) {
throw new RuntimeException("this file format does not support parent addresses");
}
final int flags = buffer.readUnsignedByte();
if (BinaryDictInputOutput.isMovedGroup(flags, formatOptions)) {
// if the group is moved, the parent address is stored in the destination group.
// We are guaranteed to process the destination group later, so there is no need to
// update anything here.
buffer.position(originalPosition);
return;
}
if (DBG) {
MakedictLog.d("update parent address flags=" + flags + ", " + groupOriginAddress);
}
final int parentOffset = newParentAddress - groupOriginAddress;
writeSInt24ToBuffer(buffer, parentOffset);
buffer.position(originalPosition);
}
private static void skipCharGroup(final FusionDictionaryBufferInterface buffer,
final FormatOptions formatOptions) {
final int flags = buffer.readUnsignedByte();
BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
BinaryDictInputOutput.readChildrenAddress(buffer, flags, formatOptions);
if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
if ((flags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS) != 0) {
final int shortcutsSize = buffer.readUnsignedShort();
buffer.position(buffer.position() + shortcutsSize
- FormatSpec.GROUP_SHORTCUT_LIST_SIZE_SIZE);
}
if ((flags & FormatSpec.FLAG_HAS_BIGRAMS) != 0) {
int bigramCount = 0;
while (bigramCount++ < FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
final int bigramFlags = buffer.readUnsignedByte();
switch (bigramFlags & FormatSpec.MASK_ATTRIBUTE_ADDRESS_TYPE) {
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE:
buffer.readUnsignedByte();
break;
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES:
buffer.readUnsignedShort();
break;
case FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES:
buffer.readUnsignedInt24();
break;
}
if ((bigramFlags & FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT) == 0) break;
}
if (bigramCount >= FormatSpec.MAX_BIGRAMS_IN_A_GROUP) {
throw new RuntimeException("Too many bigrams in a group.");
}
}
}
/**
* Update parent addresses in a Node that is referred to by nodeOriginAddress.
*
* @param buffer the buffer to be modified.
* @param nodeOriginAddress the address of a modified Node.
* @param newParentAddress the address to be written.
* @param formatOptions file format options.
*/
public static void updateParentAddresses(final FusionDictionaryBufferInterface buffer,
final int nodeOriginAddress, final int newParentAddress,
final FormatOptions formatOptions) {
final int originalPosition = buffer.position();
buffer.position(nodeOriginAddress);
do {
final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
for (int i = 0; i < count; ++i) {
updateParentAddress(buffer, buffer.position(), newParentAddress, formatOptions);
skipCharGroup(buffer, formatOptions);
}
final int forwardLinkAddress = buffer.readUnsignedInt24();
buffer.position(forwardLinkAddress);
} while (formatOptions.mSupportsDynamicUpdate
&& buffer.position() != FormatSpec.NO_FORWARD_LINK_ADDRESS);
buffer.position(originalPosition);
}
private static void skipString(final FusionDictionaryBufferInterface buffer,
final boolean hasMultipleChars) {
if (hasMultipleChars) {
int character = CharEncoding.readChar(buffer);
while (character != FormatSpec.INVALID_CHARACTER) {
character = CharEncoding.readChar(buffer);
}
} else {
CharEncoding.readChar(buffer);
}
}
/**
* Write a string to a stream.
*
* @param destination the stream to write.
* @param word the string to be written.
* @return the size written, in bytes.
* @throws IOException
*/
private static int writeString(final OutputStream destination, final String word)
throws IOException {
int size = 0;
final int length = word.length();
for (int i = 0; i < length; i = word.offsetByCodePoints(i, 1)) {
final int codePoint = word.codePointAt(i);
if (CharEncoding.getCharSize(codePoint) == 1) {
destination.write((byte)codePoint);
size++;
} else {
destination.write((byte)(0xFF & (codePoint >> 16)));
destination.write((byte)(0xFF & (codePoint >> 8)));
destination.write((byte)(0xFF & codePoint));
size += 3;
}
}
destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
size += FormatSpec.GROUP_TERMINATOR_SIZE;
return size;
}
/**
* Update a children address in a CharGroup that is addressed by groupOriginAddress.
*
* @param buffer the buffer to write.
* @param groupOriginAddress the address of the group.
* @param newChildrenAddress the absolute address of the child.
* @param formatOptions file format options.
*/
public static void updateChildrenAddress(final FusionDictionaryBufferInterface buffer,
final int groupOriginAddress, final int newChildrenAddress,
final FormatOptions formatOptions) {
final int originalPosition = buffer.position();
buffer.position(groupOriginAddress);
final int flags = buffer.readUnsignedByte();
final int parentAddress = BinaryDictInputOutput.readParentAddress(buffer, formatOptions);
skipString(buffer, (flags & FormatSpec.FLAG_HAS_MULTIPLE_CHARS) != 0);
if ((flags & FormatSpec.FLAG_IS_TERMINAL) != 0) buffer.readUnsignedByte();
final int childrenOffset = newChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS
? FormatSpec.NO_CHILDREN_ADDRESS : newChildrenAddress - buffer.position();
writeSInt24ToBuffer(buffer, childrenOffset);
buffer.position(originalPosition);
}
/**
* Write a char group to an output stream.
* A char group is an in-memory representation of a node in trie.
* A char group info is an on-disk representation of a node.
*
* @param destination the stream to write.
* @param info the char group info to be written.
* @return the size written, in bytes.
*/
public static int writeCharGroup(final OutputStream destination, final CharGroupInfo info)
throws IOException {
int size = FormatSpec.GROUP_FLAGS_SIZE;
destination.write((byte)info.mFlags);
final int parentOffset = info.mParentAddress == FormatSpec.NO_PARENT_ADDRESS ?
FormatSpec.NO_PARENT_ADDRESS : info.mParentAddress - info.mOriginalAddress;
size += writeSInt24ToStream(destination, parentOffset);
for (int i = 0; i < info.mCharacters.length; ++i) {
if (CharEncoding.getCharSize(info.mCharacters[i]) == 1) {
destination.write((byte)info.mCharacters[i]);
size++;
} else {
size += writeSInt24ToStream(destination, info.mCharacters[i]);
}
}
if (info.mCharacters.length > 1) {
destination.write((byte)FormatSpec.GROUP_CHARACTERS_TERMINATOR);
size++;
}
if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
destination.write((byte)info.mFrequency);
size++;
}
if (DBG) {
MakedictLog.d("writeCharGroup origin=" + info.mOriginalAddress + ", size=" + size
+ ", child=" + info.mChildrenAddress + ", characters ="
+ new String(info.mCharacters, 0, info.mCharacters.length));
}
final int childrenOffset = info.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS ?
0 : info.mChildrenAddress - (info.mOriginalAddress + size);
writeSInt24ToStream(destination, childrenOffset);
size += FormatSpec.SIGNED_CHILDREN_ADDRESS_SIZE;
if (info.mShortcutTargets != null && info.mShortcutTargets.size() > 0) {
final int shortcutListSize =
BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
destination.write((byte)(shortcutListSize >> 8));
destination.write((byte)(shortcutListSize & 0xFF));
size += 2;
final Iterator<WeightedString> shortcutIterator = info.mShortcutTargets.iterator();
while (shortcutIterator.hasNext()) {
final WeightedString target = shortcutIterator.next();
destination.write((byte)BinaryDictInputOutput.makeShortcutFlags(
shortcutIterator.hasNext(), target.mFrequency));
size++;
size += writeString(destination, target.mWord);
}
}
if (info.mBigrams != null) {
// TODO: Consolidate this code with the code that computes the size of the bigram list
// in BinaryDictionaryInputOutput#computeActualNodeSize
for (int i = 0; i < info.mBigrams.size(); ++i) {
final int bigramFrequency = info.mBigrams.get(i).mFrequency;
int bigramFlags = (i < info.mBigrams.size() - 1)
? FormatSpec.FLAG_ATTRIBUTE_HAS_NEXT : 0;
size++;
final int bigramOffset = info.mBigrams.get(i).mAddress - (info.mOriginalAddress
+ size);
bigramFlags |= (bigramOffset < 0) ? FormatSpec.FLAG_ATTRIBUTE_OFFSET_NEGATIVE : 0;
switch (BinaryDictInputOutput.getByteSize(bigramOffset)) {
case 1:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_ONEBYTE;
break;
case 2:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_TWOBYTES;
break;
case 3:
bigramFlags |= FormatSpec.FLAG_ATTRIBUTE_ADDRESS_TYPE_THREEBYTES;
break;
}
bigramFlags |= bigramFrequency & FormatSpec.FLAG_ATTRIBUTE_FREQUENCY;
destination.write((byte)bigramFlags);
size += writeVariableAddress(destination, Math.abs(bigramOffset));
}
}
return size;
}
@SuppressWarnings("unused")
private static void updateForwardLink(final FusionDictionaryBufferInterface buffer,
final int nodeOriginAddress, final int newNodeAddress,
final FormatOptions formatOptions) {
buffer.position(nodeOriginAddress);
int jumpCount = 0;
while (jumpCount++ < MAX_JUMPS) {
final int count = BinaryDictInputOutput.readCharGroupCount(buffer);
for (int i = 0; i < count; ++i) skipCharGroup(buffer, formatOptions);
final int forwardLinkAddress = buffer.readUnsignedInt24();
if (forwardLinkAddress == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
buffer.position(buffer.position() - FormatSpec.FORWARD_LINK_ADDRESS_SIZE);
writeSInt24ToBuffer(buffer, newNodeAddress);
return;
}
buffer.position(forwardLinkAddress);
}
if (DBG && jumpCount >= MAX_JUMPS) {
throw new RuntimeException("too many jumps, probably a bug.");
}
}
/**
* Helper method to move a char group to the tail of the file.
*/
private static int moveCharGroup(final OutputStream destination,
final FusionDictionaryBufferInterface buffer, final CharGroupInfo info,
final int nodeOriginAddress, final int oldGroupAddress,
final FormatOptions formatOptions) throws IOException {
updateParentAddress(buffer, oldGroupAddress, buffer.limit() + 1, formatOptions);
buffer.position(oldGroupAddress);
final int currentFlags = buffer.readUnsignedByte();
buffer.position(oldGroupAddress);
buffer.put((byte)(FormatSpec.FLAG_IS_MOVED | (currentFlags
& (~FormatSpec.MASK_MOVE_AND_DELETE_FLAG))));
int size = FormatSpec.GROUP_FLAGS_SIZE;
updateForwardLink(buffer, nodeOriginAddress, buffer.limit(), formatOptions);
size += writeNode(destination, new CharGroupInfo[] { info });
return size;
}
/**
* Compute the size of the char group.
*/
private static int computeGroupSize(final CharGroupInfo info,
final FormatOptions formatOptions) {
int size = FormatSpec.GROUP_FLAGS_SIZE + FormatSpec.PARENT_ADDRESS_SIZE
+ BinaryDictInputOutput.getGroupCharactersSize(info.mCharacters)
+ BinaryDictInputOutput.getChildrenAddressSize(info.mFlags, formatOptions);
if ((info.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0) {
size += FormatSpec.GROUP_FREQUENCY_SIZE;
}
if (info.mShortcutTargets != null && !info.mShortcutTargets.isEmpty()) {
size += BinaryDictInputOutput.getShortcutListSize(info.mShortcutTargets);
}
if (info.mBigrams != null) {
for (final PendingAttribute attr : info.mBigrams) {
size += FormatSpec.GROUP_FLAGS_SIZE;
size += BinaryDictInputOutput.getByteSize(attr.mAddress);
}
}
return size;
}
/**
* Write a node to the stream.
*
* @param destination the stream to write.
* @param infos groups to be written.
* @return the size written, in bytes.
* @throws IOException
*/
private static int writeNode(final OutputStream destination, final CharGroupInfo[] infos)
throws IOException {
int size = BinaryDictInputOutput.getGroupCountSize(infos.length);
switch (BinaryDictInputOutput.getGroupCountSize(infos.length)) {
case 1:
destination.write((byte)infos.length);
break;
case 2:
destination.write((byte)(infos.length >> 8));
destination.write((byte)(infos.length & 0xFF));
break;
default:
throw new RuntimeException("Invalid group count size.");
}
for (final CharGroupInfo info : infos) size += writeCharGroup(destination, info);
writeSInt24ToStream(destination, FormatSpec.NO_FORWARD_LINK_ADDRESS);
return size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
/**
* Move a group that is referred to by oldGroupOrigin to the tail of the file.
* And set the children address to the byte after the group.
*
* @param nodeOrigin the address of the tail of the file.
* @param characters
* @param length
* @param flags
* @param frequency
* @param parentAddress
* @param shortcutTargets
* @param bigrams
* @param destination the stream representing the tail of the file.
* @param buffer the buffer representing the (constant-size) body of the file.
* @param oldNodeOrigin
* @param oldGroupOrigin
* @param formatOptions
* @return the size written, in bytes.
* @throws IOException
*/
private static int moveGroup(final int nodeOrigin, final int[] characters, final int length,
final int flags, final int frequency, final int parentAddress,
final ArrayList<WeightedString> shortcutTargets,
final ArrayList<PendingAttribute> bigrams, final OutputStream destination,
final FusionDictionaryBufferInterface buffer, final int oldNodeOrigin,
final int oldGroupOrigin, final FormatOptions formatOptions) throws IOException {
int size = 0;
final int newGroupOrigin = nodeOrigin + 1;
final int[] writtenCharacters = Arrays.copyOfRange(characters, 0, length);
final CharGroupInfo tmpInfo = new CharGroupInfo(newGroupOrigin, -1 /* endAddress */,
flags, writtenCharacters, frequency, parentAddress, FormatSpec.NO_CHILDREN_ADDRESS,
shortcutTargets, bigrams);
size = computeGroupSize(tmpInfo, formatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(newGroupOrigin, newGroupOrigin + size,
flags, writtenCharacters, frequency, parentAddress,
nodeOrigin + 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE, shortcutTargets,
bigrams);
moveCharGroup(destination, buffer, newInfo, oldNodeOrigin, oldGroupOrigin, formatOptions);
return 1 + size + FormatSpec.FORWARD_LINK_ADDRESS_SIZE;
}
/**
* Insert a word into a binary dictionary.
*
* @param buffer
* @param destination
* @param word
* @param frequency
* @param bigramStrings
* @param shortcuts
* @throws IOException
* @throws UnsupportedFormatException
*/
// TODO: Support batch insertion.
// TODO: Remove @UsedForTesting once UserHistoryDictionary is implemented by BinaryDictionary.
@UsedForTesting
public static void insertWord(final FusionDictionaryBufferInterface buffer,
final OutputStream destination, final String word, final int frequency,
final ArrayList<WeightedString> bigramStrings,
final ArrayList<WeightedString> shortcuts, final boolean isNotAWord,
final boolean isBlackListEntry)
throws IOException, UnsupportedFormatException {
final ArrayList<PendingAttribute> bigrams = new ArrayList<PendingAttribute>();
if (bigramStrings != null) {
for (final WeightedString bigram : bigramStrings) {
int position = getTerminalPosition(buffer, bigram.mWord);
if (position == FormatSpec.NOT_VALID_WORD) {
// TODO: figure out what is the correct thing to do here.
} else {
bigrams.add(new PendingAttribute(bigram.mFrequency, position));
}
}
}
final boolean isTerminal = true;
final boolean hasBigrams = !bigrams.isEmpty();
final boolean hasShortcuts = shortcuts != null && !shortcuts.isEmpty();
// find the insert position of the word.
if (buffer.position() != 0) buffer.position(0);
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
int wordPos = 0, address = buffer.position(), nodeOriginAddress = buffer.position();
final int[] codePoints = FusionDictionary.getCodePoints(word);
final int wordLen = codePoints.length;
for (int depth = 0; depth < Constants.Dictionary.MAX_WORD_LENGTH; ++depth) {
if (wordPos >= wordLen) break;
nodeOriginAddress = buffer.position();
int nodeParentAddress = -1;
final int charGroupCount = BinaryDictInputOutput.readCharGroupCount(buffer);
boolean foundNextGroup = false;
for (int i = 0; i < charGroupCount; ++i) {
address = buffer.position();
final CharGroupInfo currentInfo = BinaryDictInputOutput.readCharGroup(buffer,
buffer.position(), header.mFormatOptions);
final boolean isMovedGroup = BinaryDictInputOutput.isMovedGroup(currentInfo.mFlags,
header.mFormatOptions);
if (isMovedGroup) continue;
nodeParentAddress = (currentInfo.mParentAddress == FormatSpec.NO_PARENT_ADDRESS)
? FormatSpec.NO_PARENT_ADDRESS : currentInfo.mParentAddress + address;
boolean matched = true;
for (int p = 0; p < currentInfo.mCharacters.length; ++p) {
if (wordPos + p >= wordLen) {
/*
* splitting
* before
* abcd - ef
*
* insert "abc"
*
* after
* abc - d - ef
*/
final int newNodeAddress = buffer.limit();
final int flags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
isTerminal, 0, hasShortcuts, hasBigrams, false /* isNotAWord */,
false /* isBlackListEntry */, header.mFormatOptions);
int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p, flags,
frequency, nodeParentAddress, shortcuts, bigrams, destination,
buffer, nodeOriginAddress, address, header.mFormatOptions);
final int[] characters2 = Arrays.copyOfRange(currentInfo.mCharacters, p,
currentInfo.mCharacters.length);
if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
updateParentAddresses(buffer, currentInfo.mChildrenAddress,
newNodeAddress + written + 1, header.mFormatOptions);
}
final CharGroupInfo newInfo2 = new CharGroupInfo(
newNodeAddress + written + 1, -1 /* endAddress */,
currentInfo.mFlags, characters2, currentInfo.mFrequency,
newNodeAddress + 1, currentInfo.mChildrenAddress,
currentInfo.mShortcutTargets, currentInfo.mBigrams);
writeNode(destination, new CharGroupInfo[] { newInfo2 });
return;
} else if (codePoints[wordPos + p] != currentInfo.mCharacters[p]) {
if (p > 0) {
/*
* splitting
* before
* ab - cd
*
* insert "ac"
*
* after
* a - b - cd
* |
* - c
*/
final int newNodeAddress = buffer.limit();
final int childrenAddress = currentInfo.mChildrenAddress;
// move prefix
final int prefixFlags = BinaryDictInputOutput.makeCharGroupFlags(p > 1,
false /* isTerminal */, 0 /* childrenAddressSize*/,
false /* hasShortcut */, false /* hasBigrams */,
false /* isNotAWord */, false /* isBlackListEntry */,
header.mFormatOptions);
int written = moveGroup(newNodeAddress, currentInfo.mCharacters, p,
prefixFlags, -1 /* frequency */, nodeParentAddress, null, null,
destination, buffer, nodeOriginAddress, address,
header.mFormatOptions);
final int[] suffixCharacters = Arrays.copyOfRange(
currentInfo.mCharacters, p, currentInfo.mCharacters.length);
if (currentInfo.mChildrenAddress != FormatSpec.NO_CHILDREN_ADDRESS) {
updateParentAddresses(buffer, currentInfo.mChildrenAddress,
newNodeAddress + written + 1, header.mFormatOptions);
}
final int suffixFlags = BinaryDictInputOutput.makeCharGroupFlags(
suffixCharacters.length > 1,
(currentInfo.mFlags & FormatSpec.FLAG_IS_TERMINAL) != 0,
0 /* childrenAddressSize */,
(currentInfo.mFlags & FormatSpec.FLAG_HAS_SHORTCUT_TARGETS)
!= 0,
(currentInfo.mFlags & FormatSpec.FLAG_HAS_BIGRAMS) != 0,
isNotAWord, isBlackListEntry, header.mFormatOptions);
final CharGroupInfo suffixInfo = new CharGroupInfo(
newNodeAddress + written + 1, -1 /* endAddress */, suffixFlags,
suffixCharacters, currentInfo.mFrequency, newNodeAddress + 1,
currentInfo.mChildrenAddress, currentInfo.mShortcutTargets,
currentInfo.mBigrams);
written += computeGroupSize(suffixInfo, header.mFormatOptions) + 1;
final int[] newCharacters = Arrays.copyOfRange(codePoints, wordPos + p,
codePoints.length);
final int flags = BinaryDictInputOutput.makeCharGroupFlags(
newCharacters.length > 1, isTerminal,
0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, header.mFormatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(
newNodeAddress + written, -1 /* endAddress */, flags,
newCharacters, frequency, newNodeAddress + 1,
FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
writeNode(destination, new CharGroupInfo[] { suffixInfo, newInfo });
return;
}
matched = false;
break;
}
}
if (matched) {
if (wordPos + currentInfo.mCharacters.length == wordLen) {
// the word exists in the dictionary.
// only update group.
final int newNodeAddress = buffer.limit();
final boolean hasMultipleChars = currentInfo.mCharacters.length > 1;
final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, header.mFormatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
-1 /* endAddress */, flags, currentInfo.mCharacters, frequency,
nodeParentAddress, currentInfo.mChildrenAddress, shortcuts,
bigrams);
moveCharGroup(destination, buffer, newInfo, nodeOriginAddress, address,
header.mFormatOptions);
return;
}
wordPos += currentInfo.mCharacters.length;
if (currentInfo.mChildrenAddress == FormatSpec.NO_CHILDREN_ADDRESS) {
/*
* found the prefix of the word.
* make new node and link to the node from this group.
*
* before
* ab - cd
*
* insert "abcde"
*
* after
* ab - cd - e
*/
final int newNodeAddress = buffer.limit();
updateChildrenAddress(buffer, address, newNodeAddress,
header.mFormatOptions);
final int newGroupAddress = newNodeAddress + 1;
final boolean hasMultipleChars = (wordLen - wordPos) > 1;
final int flags = BinaryDictInputOutput.makeCharGroupFlags(hasMultipleChars,
isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, header.mFormatOptions);
final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
final CharGroupInfo newInfo = new CharGroupInfo(newGroupAddress, -1, flags,
characters, frequency, address, FormatSpec.NO_CHILDREN_ADDRESS,
shortcuts, bigrams);
writeNode(destination, new CharGroupInfo[] { newInfo });
return;
}
buffer.position(currentInfo.mChildrenAddress);
foundNextGroup = true;
break;
}
}
if (foundNextGroup) continue;
// reached the end of the array.
final int linkAddressPosition = buffer.position();
int nextLink = buffer.readUnsignedInt24();
if ((nextLink & MSB24) != 0) {
nextLink = -(nextLink & SINT24_MAX);
}
if (nextLink == FormatSpec.NO_FORWARD_LINK_ADDRESS) {
/*
* expand this node.
*
* before
* ab - cd
*
* insert "abef"
*
* after
* ab - cd
* |
* - ef
*/
// change the forward link address.
final int newNodeAddress = buffer.limit();
buffer.position(linkAddressPosition);
writeSInt24ToBuffer(buffer, newNodeAddress);
final int[] characters = Arrays.copyOfRange(codePoints, wordPos, wordLen);
final int flags = BinaryDictInputOutput.makeCharGroupFlags(characters.length > 1,
isTerminal, 0 /* childrenAddressSize */, hasShortcuts, hasBigrams,
isNotAWord, isBlackListEntry, header.mFormatOptions);
final CharGroupInfo newInfo = new CharGroupInfo(newNodeAddress + 1,
-1 /* endAddress */, flags, characters, frequency, nodeParentAddress,
FormatSpec.NO_CHILDREN_ADDRESS, shortcuts, bigrams);
writeNode(destination, new CharGroupInfo[]{ newInfo });
return;
} else {
depth--;
buffer.position(nextLink);
}
}
}
/**
* Find a word from the buffer.
*
* @param buffer the buffer representing the body of the dictionary file.
* @param word the word searched
* @return the found group
* @throws IOException
* @throws UnsupportedFormatException
*/
@UsedForTesting
public static CharGroupInfo findWordFromBuffer(final FusionDictionaryBufferInterface buffer,
final String word) throws IOException, UnsupportedFormatException {
int position = getTerminalPosition(buffer, word);
if (position != FormatSpec.NOT_VALID_WORD) {
buffer.position(0);
final FileHeader header = BinaryDictInputOutput.readHeader(buffer);
buffer.position(position);
return BinaryDictInputOutput.readCharGroup(buffer, position, header.mFormatOptions);
}
return null;
}
/**
* Convenience method to read the header of a binary file.
*
* This is quite resource intensive - don't call when performance is critical.
*
* @param file The file to read.
* @param offset The offset in the file where to start reading the data.
* @param length The length of the data file.
*/
private static final int HEADER_READING_BUFFER_SIZE = 16384;
public static FileHeader getDictionaryFileHeader(
final File file, final long offset, final long length)
throws FileNotFoundException, IOException, UnsupportedFormatException {
final byte[] buffer = new byte[HEADER_READING_BUFFER_SIZE];
final FileInputStream inStream = new FileInputStream(file);
try {
inStream.read(buffer);
final BinaryDictInputOutput.ByteBufferWrapper wrapper =
new BinaryDictInputOutput.ByteBufferWrapper(inStream.getChannel().map(
FileChannel.MapMode.READ_ONLY, offset, length));
return BinaryDictInputOutput.readHeader(wrapper);
} finally {
inStream.close();
}
}
public static FileHeader getDictionaryFileHeaderOrNull(final File file, final long offset,
final long length) {
try {
final FileHeader header = getDictionaryFileHeader(file, offset, length);
return header;
} catch (UnsupportedFormatException e) {
return null;
} catch (IOException e) {
return null;
}
}
}