| From f1121648d0762cf9bf4e5117bfc1008447fb4080 Mon Sep 17 00:00:00 2001 |
| From: android |
| Date: Thu, 1 Apr 2010 11:46:35 -0700 |
| Subject: [PATCH] Add ICU support for libxml. |
| |
| This is derived from Jungshik's patch. encoding.c is copied from Chrome's source, |
| which carries one more modification than Jungshik's patch does. |
| |
| Issue:2557315 |
| Change-Id: I8e4c9e544660f3f943a15042756f7248d5afff8e |
| --- |
| Android.mk | 4 +- |
| encoding.c | 248 +++++++++++++++++++++++++++++++++++++++++- |
| include/libxml/encoding.h | 29 +++++ |
| include/libxml/parser.h | 3 +- |
| include/libxml/xmlversion.h | 11 ++- |
| parser.c | 9 ++ |
| xmlregexp.c | 2 +- |
| 7 files changed, 294 insertions(+), 12 deletions(-) |
| |
| diff --git a/Android.mk b/Android.mk |
| index 3d0ede8..08bf11f 100644 |
| --- a/Android.mk |
| +++ b/Android.mk |
| @@ -57,7 +57,7 @@ common_C_INCLUDES += \ |
| include $(CLEAR_VARS) |
| |
| LOCAL_SRC_FILES := $(common_SRC_FILES) |
| -LOCAL_C_INCLUDES += $(common_C_INCLUDES) |
| +LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common |
| LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) |
| LOCAL_CFLAGS += -fvisibility=hidden |
| |
| @@ -71,7 +71,7 @@ include $(BUILD_STATIC_LIBRARY) |
| |
| include $(CLEAR_VARS) |
| LOCAL_SRC_FILES := $(common_SRC_FILES) |
| -LOCAL_C_INCLUDES += $(common_C_INCLUDES) |
| +LOCAL_C_INCLUDES += $(common_C_INCLUDES) external/icu4c/common |
| LOCAL_SHARED_LIBRARIES += $(common_SHARED_LIBRARIES) |
| LOCAL_MODULE:= libxml2 |
| include $(BUILD_HOST_STATIC_LIBRARY) |
| diff --git a/encoding.c b/encoding.c |
| index e2df797..2abc32e 100644 |
| --- a/encoding.c |
| +++ b/encoding.c |
| @@ -58,7 +58,7 @@ static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; |
| static int xmlCharEncodingAliasesNb = 0; |
| static int xmlCharEncodingAliasesMax = 0; |
| |
| -#ifdef LIBXML_ICONV_ENABLED |
| +#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED) |
| #if 0 |
| #define DEBUG_ENCODING /* Define this to get encoding traces */ |
| #endif |
| @@ -97,6 +97,54 @@ xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val) |
| NULL, 0, val, NULL, NULL, 0, 0, msg, val); |
| } |
| |
| +#ifdef LIBXML_ICU_ENABLED |
| +static uconv_t* /* a new converter pair, or NULL on any failure; release with closeIcuConverter() */ |
| +openIcuConverter(const char* name, int toUnicode) |
| +{ /* Opens the ICU converter for encoding @name together with a UTF-8 converter. */ |
| + UErrorCode status = U_ZERO_ERROR; |
| + uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t)); |
| + if (conv == NULL) |
| + return NULL; |
| + |
| + conv->uconv = ucnv_open(name, &status); /* converter for the requested encoding */ |
| + if (U_FAILURE(status)) |
| + goto error; |
| + |
| + status = U_ZERO_ERROR; |
| + if (toUnicode) { /* @toUnicode selects which direction gets the STOP error callback */ |
| + ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, |
| + NULL, NULL, NULL, &status); |
| + } |
| + else { |
| + ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, |
| + NULL, NULL, NULL, &status); |
| + } |
| + if (U_FAILURE(status)) |
| + goto error; |
| + |
| + status = U_ZERO_ERROR; |
| + conv->utf8 = ucnv_open("UTF-8", &status); /* second converter, for the UTF-8 side */ |
| + if (U_SUCCESS(status)) |
| + return conv; |
| + |
| +error: /* only conv->uconv can be open here: conv->utf8 was never assigned or its open failed */ |
| + if (conv->uconv) |
| + ucnv_close(conv->uconv); |
| + xmlFree(conv); |
| + return NULL; |
| +} |
| + |
| +static void /* Closes both ICU converters held by @conv, then frees @conv itself. */ |
| +closeIcuConverter(uconv_t *conv) |
| +{ |
| + if (conv != NULL) { /* a NULL @conv is accepted and ignored */ |
| + ucnv_close(conv->uconv); |
| + ucnv_close(conv->utf8); |
| + xmlFree(conv); |
| + } |
| +} |
| +#endif /* LIBXML_ICU_ENABLED */ |
| + |
| /************************************************************************ |
| * * |
| * Conversions To/From UTF8 encoding * |
| @@ -1306,7 +1354,11 @@ xmlNewCharEncodingHandler(const char *name, |
| #ifdef LIBXML_ICONV_ENABLED |
| handler->iconv_in = NULL; |
| handler->iconv_out = NULL; |
| -#endif /* LIBXML_ICONV_ENABLED */ |
| +#endif |
| +#ifdef LIBXML_ICU_ENABLED |
| + handler->uconv_in = NULL; |
| + handler->uconv_out = NULL; |
| +#endif |
| |
| /* |
| * registers and returns the handler. |
| @@ -1371,7 +1423,7 @@ xmlInitCharEncodingHandlers(void) { |
| xmlNewCharEncodingHandler("ASCII", asciiToUTF8, NULL); |
| xmlNewCharEncodingHandler("US-ASCII", asciiToUTF8, NULL); |
| #endif /* LIBXML_OUTPUT_ENABLED */ |
| -#ifndef LIBXML_ICONV_ENABLED |
| +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) |
| #ifdef LIBXML_ISO8859X_ENABLED |
| xmlRegisterCharEncodingHandlersISO8859x (); |
| #endif |
| @@ -1576,6 +1628,10 @@ xmlFindCharEncodingHandler(const char *name) { |
| xmlCharEncodingHandlerPtr enc; |
| iconv_t icv_in, icv_out; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + xmlCharEncodingHandlerPtr enc; |
| + uconv_t *ucv_in, *ucv_out; |
| +#endif /* LIBXML_ICU_ENABLED */ |
| char upper[100]; |
| int i; |
| |
| @@ -1642,6 +1698,35 @@ xmlFindCharEncodingHandler(const char *name) { |
| "iconv : problems with filters for '%s'\n", name); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + /* check whether icu can handle this */ |
| + ucv_in = openIcuConverter(name, 1); |
| + ucv_out = openIcuConverter(name, 0); |
| + if (ucv_in != NULL && ucv_out != NULL) { |
| + enc = (xmlCharEncodingHandlerPtr) |
| + xmlMalloc(sizeof(xmlCharEncodingHandler)); |
| + if (enc == NULL) { |
| + closeIcuConverter(ucv_in); |
| + closeIcuConverter(ucv_out); |
| + return(NULL); |
| + } |
| + enc->name = xmlMemStrdup(name); |
| + enc->input = NULL; |
| + enc->output = NULL; |
| + enc->uconv_in = ucv_in; |
| + enc->uconv_out = ucv_out; |
| +#ifdef DEBUG_ENCODING |
| + xmlGenericError(xmlGenericErrorContext, |
| + "Found ICU converter handler for encoding %s\n", name); |
| +#endif |
| + return enc; |
| + } else if (ucv_in != NULL || ucv_out != NULL) { |
| + closeIcuConverter(ucv_in); |
| + closeIcuConverter(ucv_out); |
| + xmlEncodingErr(XML_ERR_INTERNAL_ERROR, |
| + "ICU converter : problems with filters for '%s'\n", name); |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| @@ -1732,6 +1817,75 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, |
| |
| /************************************************************************ |
| * * |
| + * ICU based generic conversion functions * |
| + * * |
| + ************************************************************************/ |
| + |
| +#ifdef LIBXML_ICU_ENABLED |
| +/** |
| + * xmlUconvWrapper: |
| + * @cd: the ICU converter pair (encoding converter plus UTF-8 converter) |
| + * @toUnicode: non-zero to convert from the encoding to UTF-8, 0 for the reverse |
| + * @out: a pointer to an array of bytes to store the result |
| + * @outlen: the length of @out |
| + * @in: a pointer to an array of bytes to convert |
| + * @inlen: the length of @in |
| + * |
| + * Returns 0 if success, or |
| + * -1 by lack of space (or if @out, @outlen, @in or @inlen is NULL), or |
| + * -2 if the transcoding fails (ICU reports U_INVALID_CHAR_FOUND or |
| + * U_ILLEGAL_CHAR_FOUND), or |
| + * -3 for any other ICU error, e.g. U_TRUNCATED_CHAR_FOUND. |
| + * |
| + * The value of @inlen after return is the number of octets consumed |
| + * from @in, whatever the return value (unless an argument was NULL). |
| + * The value of @outlen after return is the number of octets produced. |
| + */ |
| +static int |
| +xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, |
| + const unsigned char *in, int *inlen) { |
| + const char *ucv_in = (const char *) in; |
| + char *ucv_out = (char *) out; |
| + UErrorCode err = U_ZERO_ERROR; |
| + |
| + if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { /* @cd is not checked */ |
| + if (outlen != NULL) *outlen = 0; |
| + return(-1); |
| + } |
| + |
| + /* |
| + * TODO(jungshik) |
| + * 1. is ucnv_convert(To|From)Algorithmic better? |
| + * 2. had we better use an explicit pivot buffer? |
| + * 3. error returned comes from 'fromUnicode' only even |
| + * when toUnicode is true ! |
| + */ |
| + if (toUnicode) { |
| + /* encoding => UTF-16 => UTF-8 */ |
| + ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, |
| + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, |
| + 0, TRUE, &err); /* reset = 0, flush = TRUE */ |
| + } else { |
| + /* UTF-8 => UTF-16 => encoding */ |
| + ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, |
| + &ucv_in, ucv_in + *inlen, NULL, NULL, NULL, NULL, |
| + 0, TRUE, &err); /* reset = 0, flush = TRUE */ |
| + } |
| + *inlen = ucv_in - (const char*) in; /* how far ICU advanced the source pointer */ |
| + *outlen = ucv_out - (char *) out; /* how far ICU advanced the target pointer */ |
| + if (U_SUCCESS(err)) |
| + return 0; |
| + if (err == U_BUFFER_OVERFLOW_ERROR) |
| + return -1; |
| + if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) |
| + return -2; |
| + /* if (err == U_TRUNCATED_CHAR_FOUND) */ |
| + return -3; |
| +} |
| +#endif /* LIBXML_ICU_ENABLED */ |
| + |
| +/************************************************************************ |
| + * * |
| * The real API used by libxml for on-the-fly conversion * |
| * * |
| ************************************************************************/ |
| @@ -1794,6 +1948,16 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, |
| if (ret == -1) ret = -3; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_in != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], |
| + &written, in->content, &toconv); |
| + xmlBufferShrink(in, toconv); |
| + out->use += written; |
| + out->content[out->use] = 0; |
| + if (ret == -1) ret = -3; |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| switch (ret) { |
| case 0: |
| @@ -1879,6 +2043,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, |
| ret = -3; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_in != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_in, 1, &out->content[out->use], |
| + &written, in->content, &toconv); |
| + xmlBufferShrink(in, toconv); |
| + out->use += written; |
| + out->content[out->use] = 0; |
| + if (ret == -1) |
| + ret = -3; |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| switch (ret) { |
| case 0: |
| #ifdef DEBUG_ENCODING |
| @@ -1979,6 +2154,15 @@ retry: |
| out->content[out->use] = 0; |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_out != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_out, 0, |
| + &out->content[out->use], |
| + &written, NULL, &toconv); |
| + out->use += written; |
| + out->content[out->use] = 0; |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| #ifdef DEBUG_ENCODING |
| xmlGenericError(xmlGenericErrorContext, |
| "initialized encoder\n"); |
| @@ -2003,7 +2187,7 @@ retry: |
| xmlBufferShrink(in, toconv); |
| out->use += written; |
| writtentot += written; |
| - } |
| + } |
| out->content[out->use] = 0; |
| } |
| #ifdef LIBXML_ICONV_ENABLED |
| @@ -2025,6 +2209,26 @@ retry: |
| } |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + else if (handler->uconv_out != NULL) { |
| + ret = xmlUconvWrapper(handler->uconv_out, 0, |
| + &out->content[out->use], |
| + &written, in->content, &toconv); |
| + xmlBufferShrink(in, toconv); |
| + out->use += written; |
| + writtentot += written; |
| + out->content[out->use] = 0; |
| + if (ret == -1) { |
| + if (written > 0) { |
| + /* |
| + * Can be a limitation of iconv |
| + */ |
| + goto retry; |
| + } |
| + ret = -3; |
| + } |
| + } |
| +#endif /* LIBXML_ICU_ENABLED */ |
| else { |
| xmlEncodingErr(XML_I18N_NO_OUTPUT, |
| "xmlCharEncOutFunc: no output function !\n", NULL); |
| @@ -2137,6 +2341,22 @@ xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { |
| xmlFree(handler); |
| } |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + if ((handler->uconv_out != NULL) || (handler->uconv_in != NULL)) { |
| + if (handler->name != NULL) |
| + xmlFree(handler->name); |
| + handler->name = NULL; |
| + if (handler->uconv_out != NULL) { |
| + closeIcuConverter(handler->uconv_out); |
| + handler->uconv_out = NULL; |
| + } |
| + if (handler->uconv_in != NULL) { |
| + closeIcuConverter(handler->uconv_in); |
| + handler->uconv_in = NULL; |
| + } |
| + xmlFree(handler); |
| + } |
| +#endif |
| #ifdef DEBUG_ENCODING |
| if (ret) |
| xmlGenericError(xmlGenericErrorContext, |
| @@ -2212,6 +2432,23 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { |
| cur += toconv; |
| } while (ret == -2); |
| #endif |
| +#ifdef LIBXML_ICU_ENABLED |
| + } else if (handler->uconv_out != NULL) { |
| + do { |
| + toconv = in->end - cur; |
| + written = 32000; |
| + ret = xmlUconvWrapper(handler->uconv_out, 0, &convbuf[0], |
| + &written, cur, &toconv); |
| + if (ret < 0) { /* some output produced: go round again; none: give up */ |
| + if (written > 0) |
| + ret = -2; |
| + else |
| + return(-1); |
| + } |
| + unused += written; |
| + cur += toconv; |
| + } while (ret == -2); |
| +#endif /* LIBXML_ICU_ENABLED: must close before the fallback branch so the function stays balanced without ICU */ |
| } else { |
| /* could not find a converter */ |
| return(-1); |
| @@ -2223,8 +2460,8 @@ xmlByteConsumed(xmlParserCtxtPtr ctxt) { |
| } |
| return(in->consumed + (in->cur - in->base)); |
| } |
| |
| -#ifndef LIBXML_ICONV_ENABLED |
| +#if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) |
| #ifdef LIBXML_ISO8859X_ENABLED |
| |
| /** |
| @@ -3296,4 +3533,3 @@ xmlRegisterCharEncodingHandlersISO8859x (void) { |
| |
| #define bottom_encoding |
| #include "elfgcchack.h" |
| - |
| diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h |
| index c74b25f..c68ec10 100644 |
| --- a/include/libxml/encoding.h |
| +++ b/include/libxml/encoding.h |
| @@ -26,6 +26,24 @@ |
| |
| #ifdef LIBXML_ICONV_ENABLED |
| #include <iconv.h> |
| +#else |
| +#ifdef LIBXML_ICU_ENABLED |
| +#include <unicode/ucnv.h> |
| +#if 0 |
| +/* Forward-declare UConverter here rather than pulling in <unicode/ucnv.h> |
| + * to prevent unwanted ICU symbols being exposed to users of libxml2. |
| + * One particular case is Qt4 conflicting on UChar32. |
| + */ |
| +#include <stdint.h> |
| +struct UConverter; |
| +typedef struct UConverter UConverter; |
| +#ifdef _MSC_VER |
| +typedef wchar_t UChar; |
| +#else |
| +typedef uint16_t UChar; |
| +#endif |
| +#endif |
| +#endif |
| #endif |
| #ifdef __cplusplus |
| extern "C" { |
| @@ -125,6 +143,13 @@ typedef int (* xmlCharEncodingOutputFunc)(unsigned char *out, int *outlen, |
| * Block defining the handlers for non UTF-8 encodings. |
| * If iconv is supported, there are two extra fields. |
| */ |
| +#ifdef LIBXML_ICU_ENABLED |
| +struct _uconv_t { /* ICU converter pair stored in xmlCharEncodingHandler's uconv_in / uconv_out */ |
| + UConverter *uconv; /* for conversion between an encoding and UTF-16 */ |
| + UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ |
| +}; |
| +typedef struct _uconv_t uconv_t; |
| +#endif |
| |
| typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; |
| typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; |
| @@ -136,6 +161,10 @@ struct _xmlCharEncodingHandler { |
| iconv_t iconv_in; |
| iconv_t iconv_out; |
| #endif /* LIBXML_ICONV_ENABLED */ |
| +#ifdef LIBXML_ICU_ENABLED |
| + uconv_t *uconv_in; |
| + uconv_t *uconv_out; |
| +#endif /* LIBXML_ICU_ENABLED */ |
| }; |
| |
| #ifdef __cplusplus |
| diff --git a/include/libxml/parser.h b/include/libxml/parser.h |
| index 567addb..bd9de24 100644 |
| --- a/include/libxml/parser.h |
| +++ b/include/libxml/parser.h |
| @@ -276,6 +276,7 @@ struct _xmlParserCtxt { |
| int nsNr; /* the number of inherited namespaces */ |
| int nsMax; /* the size of the arrays */ |
| const xmlChar * *nsTab; /* the array of prefix/namespace name */ |
| + struct _xmlParserCtxt *nsParent; /* parent context to inherit namespaces from * */ |
| int *attallocs; /* which attribute were allocated */ |
| void * *pushTab; /* array of data for push */ |
| xmlHashTablePtr attsDefault; /* defaulted attributes if any */ |
| @@ -1213,6 +1214,7 @@ typedef enum { |
| XML_WITH_DEBUG_MEM = 29, |
| XML_WITH_DEBUG_RUN = 30, |
| XML_WITH_ZLIB = 31, |
| + XML_WITH_ICU = 32, |
| XML_WITH_NONE = 99999 /* just to be sure of allocation size */ |
| } xmlFeature; |
| |
| @@ -1223,4 +1225,3 @@ XMLPUBFUN int XMLCALL |
| } |
| #endif |
| #endif /* __XML_PARSER_H__ */ |
| - |
| diff --git a/include/libxml/xmlversion.h b/include/libxml/xmlversion.h |
| index a98e00c..fb2b8ca 100644 |
| --- a/include/libxml/xmlversion.h |
| +++ b/include/libxml/xmlversion.h |
| @@ -269,6 +269,15 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); |
| #endif |
| |
| /** |
| + * LIBXML_ICU_ENABLED: |
| + * |
| + * Whether icu support is available |
| + */ |
| +#if 1 |
| +#define LIBXML_ICU_ENABLED |
| +#endif |
| + |
| +/** |
| * LIBXML_ISO8859X_ENABLED: |
| * |
| * Whether ISO-8859-* support is made available in case iconv is not |
| @@ -454,5 +463,3 @@ XMLPUBFUN void XMLCALL xmlCheckVersion(int version); |
| } |
| #endif /* __cplusplus */ |
| #endif |
| - |
| - |
| diff --git a/parser.c b/parser.c |
| index 9db664f..306b84d 100644 |
| --- a/parser.c |
| +++ b/parser.c |
| @@ -937,6 +937,12 @@ xmlHasFeature(xmlFeature feature) |
| #else |
| return(0); |
| #endif |
| + case XML_WITH_ICU: |
| +#ifdef LIBXML_ICU_ENABLED |
| + return(1); |
| +#else |
| + return(0); |
| +#endif |
| default: |
| break; |
| } |
| @@ -8189,6 +8195,7 @@ xmlGetNamespace(xmlParserCtxtPtr ctxt, const xmlChar *prefix) { |
| return(NULL); |
| return(ctxt->nsTab[i + 1]); |
| } |
| + if (ctxt->nsParent) return xmlGetNamespace(ctxt->nsParent, prefix); |
| return(NULL); |
| } |
| |
| @@ -12538,6 +12545,8 @@ xmlParseBalancedChunkMemoryInternal(xmlParserCtxtPtr oldctxt, |
| ctxt->str_xmlns = xmlDictLookup(ctxt->dict, BAD_CAST "xmlns", 5); |
| ctxt->str_xml_ns = xmlDictLookup(ctxt->dict, XML_XML_NAMESPACE, 36); |
| |
| + ctxt->nsParent = oldctxt; |
| + |
| oldsax = ctxt->sax; |
| ctxt->sax = oldctxt->sax; |
| xmlDetectSAX2(ctxt); |
| diff --git a/xmlregexp.c b/xmlregexp.c |
| index 73598a5..4258a08 100644 |
| --- a/xmlregexp.c |
| +++ b/xmlregexp.c |
| @@ -6401,7 +6401,7 @@ xmlExpHashNameComputeKey(const xmlChar *name) { |
| if (name != NULL) { |
| value += 30 * (*name); |
| while ((ch = *name++) != 0) { |
| - value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch); |
| + value = value ^ ((value << 5) + (value >> 3) + (unsigned short)ch); |
| } |
| } |
| return (value); |
| -- |
| 1.7.0.1 |
| |