| // icu.h |
| |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| // |
| // Copyright 2005-2010 Google, Inc. |
| // Author: sorenj@google.com (Jeffrey Sorensen) |
| // roubert@google.com (Fredrik Roubert) |
| // |
| // This library implements an unrestricted Thompson/Pike UTF-8 parser and |
| // serializer. UTF-8 is a restricted subset of this byte stream encoding. See |
| // http://en.wikipedia.org/wiki/UTF-8 for a good description of the encoding |
| // details. |
| |
| #ifndef FST_LIB_ICU_H_ |
| #define FST_LIB_ICU_H_ |
| |
#include <stddef.h>
#include <stdint.h>

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
| |
| namespace fst { |
| |
// Decodes the bytes of `str` as Thompson/Pike UTF-8 (sequences of one to six
// bytes) and appends one label per decoded value to `*labels`.
//
// Returns true if the whole string was decoded. Returns false, after logging
// an error, at the first malformed sequence:
//   - a continuation byte (10xxxxxx) where a lead byte is expected,
//   - a lead byte of 0xFE or 0xFF,
//   - a sequence cut short by the end of the string,
//   - a lead byte followed by something other than a continuation byte.
//
// `*labels` is never cleared: existing contents are kept, and on failure it
// still holds every label appended before the error was detected.
//
// Overlong encodings and surrogate values are decoded, not rejected.
template <class Label>
bool UTF8StringToLabels(const std::string &str, std::vector<Label> *labels) {
  const char *data = str.data();
  const size_t length = str.size();
  // The index has the same type as `length`, so the comparison is not
  // mixed-sign and the index cannot overflow on very long inputs.
  for (size_t i = 0; i < length; /* no update */) {
    const int c = data[i++] & 0xff;  // Lead byte as a value in [0, 255].
    if ((c & 0x80) == 0) {
      // 0xxxxxxx: the byte is the label.
      labels->push_back(c);
      continue;
    }
    if ((c & 0xc0) == 0x80) {
      LOG(ERROR) << "UTF8StringToLabels: continuation byte as lead byte";
      return false;
    }
    if (c >= 0xfe) {
      // 0xFE and 0xFF do not start any one- to six-byte sequence. Without
      // this check the mask below would reduce them to a single payload bit
      // and they would decode exactly like 0xFC and 0xFD.
      LOG(ERROR) << "UTF8StringToLabels: invalid lead byte: " << c;
      return false;
    }
    // Number of continuation bytes that must follow. c is at least 0xc0 here,
    // so the result is between 1 and 5.
    int count = (c >= 0xc0) + (c >= 0xe0) + (c >= 0xf0) + (c >= 0xf8) +
                (c >= 0xfc);
    // The lead byte carries its payload in its low (6 - count) bits.
    int code = c & ((1 << (6 - count)) - 1);
    for (; count != 0; --count) {
      if (i == length) {
        LOG(ERROR) << "UTF8StringToLabels: truncated utf-8 byte sequence";
        return false;
      }
      const char cb = data[i++];
      if ((cb & 0xc0) != 0x80) {
        LOG(ERROR) << "UTF8StringToLabels: missing/invalid continuation byte";
        return false;
      }
      // Each continuation byte contributes its low six bits.
      code = (code << 6) | (cb & 0x3f);
    }
    if (code < 0) {
      // This should not be able to happen: at most 31 payload bits are read.
      LOG(ERROR) << "UTF8StringToLabels: Invalid character found: " << c;
      return false;
    }
    labels->push_back(code);
  }
  return true;
}
| |
// Encodes `labels` as Thompson/Pike UTF-8 (one to six bytes per label) and
// stores the resulting byte string in `*str`, replacing its previous
// contents.
//
// Each label is converted to int32_t before it is encoded. Returns false,
// after logging an error, if a converted label is negative; in that case
// `*str` is left untouched. Returns true otherwise.
//
// Byte counts by value: < 0x80 one, < 0x800 two, < 0x10000 three,
// < 0x200000 four, < 0x4000000 five, anything larger six.
template <class Label>
bool LabelsToUTF8String(const std::vector<Label> &labels, std::string *str) {
  // Build the output locally so that `*str` is only modified on success.
  std::string result;
  result.reserve(labels.size());  // Every label yields at least one byte.
  for (size_t i = 0; i < labels.size(); ++i) {
    const int32_t code = labels[i];
    if (code < 0) {
      LOG(ERROR) << "LabelsToUTF8String: Invalid character found: " << code;
      return false;
    }
    if (code < 0x80) {
      result.push_back(static_cast<char>(code));
      continue;
    }
    // Pick the number of continuation bytes and the matching lead-byte
    // marker (the run of high one bits that announces the sequence length).
    int trailing;
    int marker;
    if (code < 0x800) {
      trailing = 1;
      marker = 0xc0;
    } else if (code < 0x10000) {
      trailing = 2;
      marker = 0xe0;
    } else if (code < 0x200000) {
      trailing = 3;
      marker = 0xf0;
    } else if (code < 0x4000000) {
      trailing = 4;
      marker = 0xf8;
    } else {
      trailing = 5;
      marker = 0xfc;
    }
    // The lead byte holds the bits above the continuation payload.
    result.push_back(static_cast<char>((code >> (6 * trailing)) | marker));
    // Continuation bytes hold six bits each, most significant group first.
    for (int shift = 6 * (trailing - 1); shift >= 0; shift -= 6) {
      result.push_back(static_cast<char>(((code >> shift) & 0x3f) | 0x80));
    }
  }
  str->swap(result);
  return true;
}
| |
| } // namespace fst |
| |
| #endif // FST_LIB_ICU_H_ |