From db1cda26539c711c3da7ed4d410dfe8190e89b8f Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 3 Aug 2019 12:12:40 -0400 Subject: convert_UTF: rewrite in C++ This allows us to namespace the symbols properly. Bug: google-breakpad:725 Change-Id: Iea8052547eef6c0acb299c1995781735c6d8994f Reviewed-on: https://chromium-review.googlesource.com/c/breakpad/breakpad/+/1769236 Reviewed-by: Mark Mentovai --- Makefile.am | 4 +- Makefile.in | 36 +- android/google_breakpad/Android.mk | 4 +- src/client/minidump_file_writer_unittest.cc | 2 +- src/client/solaris/handler/Makefile | 7 +- src/common/common.gyp | 2 +- src/common/convert_UTF.c | 554 --------------------------- src/common/convert_UTF.cc | 569 ++++++++++++++++++++++++++++ src/common/convert_UTF.h | 11 +- 9 files changed, 599 insertions(+), 590 deletions(-) delete mode 100644 src/common/convert_UTF.c create mode 100644 src/common/convert_UTF.cc diff --git a/Makefile.am b/Makefile.am index c8a57406..59788766 100644 --- a/Makefile.am +++ b/Makefile.am @@ -170,7 +170,7 @@ src_client_linux_libbreakpad_client_a_SOURCES = \ src/client/minidump_file_writer-inl.h \ src/client/minidump_file_writer.cc \ src/client/minidump_file_writer.h \ - src/common/convert_UTF.c \ + src/common/convert_UTF.cc \ src/common/convert_UTF.h \ src/common/md5.cc \ src/common/md5.h \ @@ -658,7 +658,7 @@ src_tools_mac_dump_syms_dump_syms_mac_LDADD= \ src_common_dumper_unittest_SOURCES = \ src/common/byte_cursor_unittest.cc \ - src/common/convert_UTF.c \ + src/common/convert_UTF.cc \ src/common/dwarf_cfi_to_module.cc \ src/common/dwarf_cfi_to_module_unittest.cc \ src/common/dwarf_cu_to_module.cc \ diff --git a/Makefile.in b/Makefile.in index 77b49562..2e3793ab 100644 --- a/Makefile.in +++ b/Makefile.in @@ -307,7 +307,7 @@ am__src_client_linux_libbreakpad_client_a_SOURCES_DIST = \ src/client/linux/minidump_writer/minidump_writer.cc \ src/client/minidump_file_writer-inl.h \ src/client/minidump_file_writer.cc \ - src/client/minidump_file_writer.h src/common/convert_UTF.c \ + src/client/minidump_file_writer.h src/common/convert_UTF.cc \ src/common/convert_UTF.h src/common/md5.cc src/common/md5.h \ src/common/string_conversion.cc src/common/string_conversion.h \ src/common/linux/elf_core_dump.cc src/common/linux/elfutils.cc \ @@ -689,7 +689,7 @@ src_client_linux_linux_dumper_unittest_helper_LINK = $(CXXLD) \ $(src_client_linux_linux_dumper_unittest_helper_LDFLAGS) \ $(LDFLAGS) -o $@ am__src_common_dumper_unittest_SOURCES_DIST = \ - src/common/byte_cursor_unittest.cc src/common/convert_UTF.c \ + src/common/byte_cursor_unittest.cc src/common/convert_UTF.cc \ src/common/dwarf_cfi_to_module.cc \ src/common/dwarf_cfi_to_module_unittest.cc \ src/common/dwarf_cu_to_module.cc \ @@ -2157,7 +2157,7 @@ CLEANFILES = $(am__append_12) @LINUX_HOST_TRUE@ src/client/minidump_file_writer-inl.h \ @LINUX_HOST_TRUE@ src/client/minidump_file_writer.cc \ @LINUX_HOST_TRUE@ src/client/minidump_file_writer.h \ -@LINUX_HOST_TRUE@ src/common/convert_UTF.c \ +@LINUX_HOST_TRUE@ src/common/convert_UTF.cc \ @LINUX_HOST_TRUE@ src/common/convert_UTF.h src/common/md5.cc \ @LINUX_HOST_TRUE@ src/common/md5.h \ @LINUX_HOST_TRUE@ src/common/string_conversion.cc \ @@ -2522,7 +2522,7 @@ TESTS = $(check_PROGRAMS) $(check_SCRIPTS) @DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@src_common_dumper_unittest_SOURCES = \ @DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@ src/common/byte_cursor_unittest.cc \ -@DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@ src/common/convert_UTF.c \ +@DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@ src/common/convert_UTF.cc \ @DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@ src/common/dwarf_cfi_to_module.cc \ @DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@ src/common/dwarf_cfi_to_module_unittest.cc \ @DISABLE_TOOLS_FALSE@@LINUX_HOST_TRUE@ src/common/dwarf_cu_to_module.cc \ @@ -5170,20 +5170,6 @@ src/common/android/src_client_linux_linux_client_unittest_shlib-breakpad_getcont @AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'` -src/common/src_common_dumper_unittest-convert_UTF.o: src/common/convert_UTF.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT src/common/src_common_dumper_unittest-convert_UTF.o -MD -MP -MF src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo -c -o src/common/src_common_dumper_unittest-convert_UTF.o `test -f 'src/common/convert_UTF.c' || echo '$(srcdir)/'`src/common/convert_UTF.c -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='src/common/convert_UTF.c' object='src/common/src_common_dumper_unittest-convert_UTF.o' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o src/common/src_common_dumper_unittest-convert_UTF.o `test -f 'src/common/convert_UTF.c' || echo '$(srcdir)/'`src/common/convert_UTF.c - -src/common/src_common_dumper_unittest-convert_UTF.obj: src/common/convert_UTF.c -@am__fastdepCC_TRUE@ $(AM_V_CC)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT src/common/src_common_dumper_unittest-convert_UTF.obj -MD -MP -MF src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo -c -o src/common/src_common_dumper_unittest-convert_UTF.obj `if test -f 'src/common/convert_UTF.c'; then $(CYGPATH_W) 'src/common/convert_UTF.c'; else $(CYGPATH_W) '$(srcdir)/src/common/convert_UTF.c'; fi` -@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Po -@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='src/common/convert_UTF.c' object='src/common/src_common_dumper_unittest-convert_UTF.obj' libtool=no @AMDEPBACKSLASH@ -@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@ -@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o src/common/src_common_dumper_unittest-convert_UTF.obj `if test -f 'src/common/convert_UTF.c'; then $(CYGPATH_W) 'src/common/convert_UTF.c'; else $(CYGPATH_W) '$(srcdir)/src/common/convert_UTF.c'; fi` - .cc.o: @am__fastdepCXX_TRUE@ $(AM_V_CXX)depbase=`echo $@ | sed 's|[^/]*$$|$(DEPDIR)/&|;s|\.o$$||'`;\ @am__fastdepCXX_TRUE@ $(CXXCOMPILE) -MT $@ -MD -MP -MF $$depbase.Tpo -c -o $@ $< &&\ @@ -5662,6 +5648,20 @@ src/common/src_common_dumper_unittest-byte_cursor_unittest.obj: src/common/byte_ @AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ @am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o src/common/src_common_dumper_unittest-byte_cursor_unittest.obj `if test -f 'src/common/byte_cursor_unittest.cc'; then $(CYGPATH_W) 'src/common/byte_cursor_unittest.cc'; else $(CYGPATH_W) '$(srcdir)/src/common/byte_cursor_unittest.cc'; fi` +src/common/src_common_dumper_unittest-convert_UTF.o: src/common/convert_UTF.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT src/common/src_common_dumper_unittest-convert_UTF.o -MD -MP -MF src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo -c -o src/common/src_common_dumper_unittest-convert_UTF.o `test -f 'src/common/convert_UTF.cc' || echo '$(srcdir)/'`src/common/convert_UTF.cc +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/common/convert_UTF.cc' object='src/common/src_common_dumper_unittest-convert_UTF.o' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o src/common/src_common_dumper_unittest-convert_UTF.o `test -f 'src/common/convert_UTF.cc' || echo '$(srcdir)/'`src/common/convert_UTF.cc + +src/common/src_common_dumper_unittest-convert_UTF.obj: src/common/convert_UTF.cc +@am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT src/common/src_common_dumper_unittest-convert_UTF.obj -MD -MP -MF src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo -c -o src/common/src_common_dumper_unittest-convert_UTF.obj `if test -f 'src/common/convert_UTF.cc'; then $(CYGPATH_W) 'src/common/convert_UTF.cc'; else $(CYGPATH_W) '$(srcdir)/src/common/convert_UTF.cc'; fi` +@am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Tpo src/common/$(DEPDIR)/src_common_dumper_unittest-convert_UTF.Po +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ $(AM_V_CXX)source='src/common/convert_UTF.cc' object='src/common/src_common_dumper_unittest-convert_UTF.obj' libtool=no @AMDEPBACKSLASH@ +@AMDEP_TRUE@@am__fastdepCXX_FALSE@ DEPDIR=$(DEPDIR) $(CXXDEPMODE) $(depcomp) @AMDEPBACKSLASH@ +@am__fastdepCXX_FALSE@ $(AM_V_CXX@am__nodep@)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -c -o src/common/src_common_dumper_unittest-convert_UTF.obj `if test -f 'src/common/convert_UTF.cc'; then $(CYGPATH_W) 'src/common/convert_UTF.cc'; else $(CYGPATH_W) '$(srcdir)/src/common/convert_UTF.cc'; fi` + src/common/src_common_dumper_unittest-dwarf_cfi_to_module.o: src/common/dwarf_cfi_to_module.cc @am__fastdepCXX_TRUE@ $(AM_V_CXX)$(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(src_common_dumper_unittest_CPPFLAGS) $(CPPFLAGS) $(AM_CXXFLAGS) $(CXXFLAGS) -MT src/common/src_common_dumper_unittest-dwarf_cfi_to_module.o -MD -MP -MF src/common/$(DEPDIR)/src_common_dumper_unittest-dwarf_cfi_to_module.Tpo -c -o src/common/src_common_dumper_unittest-dwarf_cfi_to_module.o `test -f 'src/common/dwarf_cfi_to_module.cc' || echo '$(srcdir)/'`src/common/dwarf_cfi_to_module.cc @am__fastdepCXX_TRUE@ $(AM_V_at)$(am__mv) src/common/$(DEPDIR)/src_common_dumper_unittest-dwarf_cfi_to_module.Tpo src/common/$(DEPDIR)/src_common_dumper_unittest-dwarf_cfi_to_module.Po diff --git a/android/google_breakpad/Android.mk b/android/google_breakpad/Android.mk index 74625eb5..20a3f4f1 100644 --- a/android/google_breakpad/Android.mk +++ b/android/google_breakpad/Android.mk @@ -82,7 +82,7 @@ LOCAL_SRC_FILES := \ src/client/linux/minidump_writer/minidump_writer.cc \ src/client/minidump_file_writer.cc \ src/common/android/breakpad_getcontext.S \ - src/common/convert_UTF.c \ + src/common/convert_UTF.cc \ src/common/md5.cc \ src/common/string_conversion.cc \ src/common/linux/elfutils.cc \ @@ -100,4 +100,4 @@ LOCAL_EXPORT_LDLIBS := -llog include $(BUILD_STATIC_LIBRARY) -# Done. \ No newline at end of file +# Done. diff --git a/src/client/minidump_file_writer_unittest.cc b/src/client/minidump_file_writer_unittest.cc index 60c364e6..256e3371 100644 --- a/src/client/minidump_file_writer_unittest.cc +++ b/src/client/minidump_file_writer_unittest.cc @@ -30,7 +30,7 @@ // Author: waylonis@google.com (Dan Waylonis) /* - g++ -I../ ../common/convert_UTF.c \ + g++ -I../ ../common/convert_UTF.cc \ ../common/string_conversion.cc \ minidump_file_writer.cc \ minidump_file_writer_unittest.cc \ diff --git a/src/client/solaris/handler/Makefile b/src/client/solaris/handler/Makefile index beeb9448..6da9464e 100644 --- a/src/client/solaris/handler/Makefile +++ b/src/client/solaris/handler/Makefile @@ -40,13 +40,13 @@ BIN_DIR=. THREAD_SRC=solaris_lwp.cc SHARE_SRC=../../minidump_file_writer.cc\ + ../../../common/convert_UTF.cc\ ../../../common/md5.cc\ ../../../common/string_conversion.cc\ ../../../common/solaris/file_id.cc\ minidump_generator.cc HANDLER_SRC=exception_handler.cc\ ../../../common/solaris/guid_creator.cc -SHARE_C_SRC=../../../common/convert_UTF.c MINIDUMP_TEST_SRC=minidump_test.cc EXCEPTION_TEST_SRC=exception_handler_test.cc @@ -54,11 +54,10 @@ EXCEPTION_TEST_SRC=exception_handler_test.cc THREAD_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o,$(THREAD_SRC)) SHARE_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o,$(SHARE_SRC)) HANDLER_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o,$(HANDLER_SRC)) -SHARE_C_OBJ=$(patsubst %.c,$(OBJ_DIR)/%.o,$(SHARE_C_SRC)) MINIDUMP_TEST_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o, $(MINIDUMP_TEST_SRC))\ - $(THREAD_OBJ) $(SHARE_OBJ) $(SHARE_C_OBJ) $(HANDLER_OBJ) + $(THREAD_OBJ) $(SHARE_OBJ) $(HANDLER_OBJ) EXCEPTION_TEST_OBJ=$(patsubst %.cc,$(OBJ_DIR)/%.o, $(EXCEPTION_TEST_SRC))\ - $(THREAD_OBJ) $(SHARE_OBJ) $(SHARE_C_OBJ) $(HANDLER_OBJ) + $(THREAD_OBJ) $(SHARE_OBJ) $(HANDLER_OBJ) BIN=$(BIN_DIR)/minidump_test\ $(BIN_DIR)/exception_handler_test diff --git a/src/common/common.gyp b/src/common/common.gyp index fe646b47..7d5e5c7d 100644 --- a/src/common/common.gyp +++ b/src/common/common.gyp @@ -61,7 +61,7 @@ 'android/ucontext_constants.h', 'basictypes.h', 'byte_cursor.h', - 'convert_UTF.c', + 'convert_UTF.cc', 'convert_UTF.h', 'dwarf/bytereader-inl.h', 'dwarf/bytereader.cc', diff --git a/src/common/convert_UTF.c b/src/common/convert_UTF.c deleted file mode 100644 index 12a3c891..00000000 --- a/src/common/convert_UTF.c +++ /dev/null @@ -1,554 +0,0 @@ -/* - * Copyright © 1991-2015 Unicode, Inc. All rights reserved. - * Distributed under the Terms of Use in - * http://www.unicode.org/copyright.html. - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of the Unicode data files and any associated documentation - * (the "Data Files") or Unicode software and any associated documentation - * (the "Software") to deal in the Data Files or Software - * without restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, and/or sell copies of - * the Data Files or Software, and to permit persons to whom the Data Files - * or Software are furnished to do so, provided that - * (a) this copyright and permission notice appear with all copies - * of the Data Files or Software, - * (b) this copyright and permission notice appear in associated - * documentation, and - * (c) there is clear notice in each modified Data File or in the Software - * as well as in the documentation associated with the Data File(s) or - * Software that the data or software has been modified. - * - * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF - * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE - * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT OF THIRD PARTY RIGHTS. - * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS - * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL - * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, - * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER - * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR - * PERFORMANCE OF THE DATA FILES OR SOFTWARE. - * - * Except as contained in this notice, the name of a copyright holder - * shall not be used in advertising or otherwise to promote the sale, - * use or other dealings in these Data Files or Software without prior - * written authorization of the copyright holder. - */ - -/* --------------------------------------------------------------------- - -Conversions between UTF32, UTF-16, and UTF-8. Source code file. -Author: Mark E. Davis, 1994. -Rev History: Rick McGowan, fixes & updates May 2001. -Sept 2001: fixed const & error conditions per -mods suggested by S. Parent & A. Lillich. -June 2002: Tim Dodd added detection and handling of incomplete -source sequences, enhanced error detection, added casts -to eliminate compiler warnings. -July 2003: slight mods to back out aggressive FFFE detection. -Jan 2004: updated switches in from-UTF8 conversions. -Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. - -See the header file "ConvertUTF.h" for complete documentation. - ------------------------------------------------------------------------- */ - - -#include "convert_UTF.h" -#ifdef CVTUTF_DEBUG -#include -#endif - -static const int halfShift = 10; /* used for shifting by 10 bits */ - -static const UTF32 halfBase = 0x0010000UL; -static const UTF32 halfMask = 0x3FFUL; - -#define UNI_SUR_HIGH_START (UTF32)0xD800 -#define UNI_SUR_HIGH_END (UTF32)0xDBFF -#define UNI_SUR_LOW_START (UTF32)0xDC00 -#define UNI_SUR_LOW_END (UTF32)0xDFFF - -#ifndef false -#define false 0 -#endif -#ifndef true -#define true 1 -#endif - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - if (target >= targetEnd) { - result = targetExhausted; break; - } - ch = *source++; - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_LEGAL_UTF32) { - if (flags == strictConversion) { - result = sourceIllegal; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - --source; /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } -*sourceStart = source; -*targetStart = target; -return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF32* target = *targetStart; - UTF32 ch, ch2; - while (source < sourceEnd) { - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - if (target >= targetEnd) { - source = oldSource; /* Back up source pointer! */ - result = targetExhausted; break; - } - *target++ = ch; - } - *sourceStart = source; - *targetStart = target; -#ifdef CVTUTF_DEBUG - if (result == sourceIllegal) { - fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); - fflush(stderr); - } -#endif - return result; -} - -/* --------------------------------------------------------------------- */ - -/* - * Index into the table below with the first byte of a UTF-8 sequence to - * get the number of trailing bytes that are supposed to follow it. - * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is - * left as-is for anyone who may want to do such conversion, which was - * allowed in earlier algorithms. - */ -static const char trailingBytesForUTF8[256] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 -}; - -/* - * Magic values subtracted from a buffer value during UTF8 conversion. - * This table contains as many values as there might be trailing bytes - * in a UTF-8 sequence. - */ -static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, - 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; - -/* - * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed - * into the first byte, depending on how many bytes follow. There are - * as many entries in this table as there are UTF-8 sequence types. - * (I.e., one byte sequence, two byte... etc.). Remember that sequencs - * for *legal* UTF-8 will be 4 or fewer bytes total. - */ -static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; - -/* --------------------------------------------------------------------- */ - -/* The interface converts a whole buffer to avoid function-call overhead. -* Constants have been gathered. Loops & conditionals have been removed as -* much as possible for efficiency, in favor of drop-through switches. -* (See "Note A" at the bottom of the file for equivalent code.) -* If your compiler supports it, the "isLegalUTF8" call can be turned -* into an inline function. -*/ - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF16* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ - ch = *source++; - /* If we have a surrogate pair, convert to UTF32 first. */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { - /* If the 16 bits following the high surrogate are in the source buffer... */ - if (source < sourceEnd) { - UTF32 ch2 = *source; - /* If it's a low surrogate, convert to UTF32. */ - if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { - ch = ((ch - UNI_SUR_HIGH_START) << halfShift) - + (ch2 - UNI_SUR_LOW_START) + halfBase; - ++source; - } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } else { /* We don't have the 16 bits following the high surrogate. */ - --source; /* return to the high surrogate */ - result = sourceExhausted; - break; - } - } else if (flags == strictConversion) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* Figure out how many bytes the result will require */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - } - - target += bytesToWrite; - if (target > targetEnd) { - source = oldSource; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } -*sourceStart = source; -*targetStart = target; -return result; -} - -/* --------------------------------------------------------------------- */ - -/* - * Utility routine to tell whether a sequence of bytes is legal UTF-8. - * This must be called with the length pre-determined by the first byte. - * If not calling this from ConvertUTF8to*, then the length can be set by: - * length = trailingBytesForUTF8[*source]+1; - * and the sequence is illegal right away if there aren't that many bytes - * available. - * If presented with a length > 4, this returns false. The Unicode - * definition of UTF-8 goes up to 4-byte sequences. - */ - -static Boolean isLegalUTF8(const UTF8 *source, int length) { - UTF8 a; - const UTF8 *srcptr = source+length; - switch (length) { - default: return false; - /* Everything else falls through when "true"... */ - case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; - case 2: if ((a = (*--srcptr)) > 0xBF) return false; - - switch (*source) { - /* no fall-through in this inner switch */ - case 0xE0: if (a < 0xA0) return false; break; - case 0xED: if (a > 0x9F) return false; break; - case 0xF0: if (a < 0x90) return false; break; - case 0xF4: if (a > 0x8F) return false; break; - default: if (a < 0x80) return false; - } - - case 1: if (*source >= 0x80 && *source < 0xC2) return false; - } - if (*source > 0xF4) return false; - return true; -} - -/* --------------------------------------------------------------------- */ - -/* - * Exported function to return whether a UTF-8 sequence is legal or not. - * This is not used here; it's just exported. - */ -Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { - int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { - return false; - } - return isLegalUTF8(source, length); -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd, - UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF16* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = (UTF16)ch; /* normal case */ - } - } else if (ch > UNI_MAX_UTF16) { - if (flags == strictConversion) { - result = sourceIllegal; - source -= (extraBytesToRead+1); /* return to the start */ - break; /* Bail out; shouldn't continue */ - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - /* target is a character in range 0xFFFF - 0x10FFFF. */ - if (target + 1 >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up source pointer! */ - result = targetExhausted; break; - } - ch -= halfBase; - *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); - *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); - } - } -*sourceStart = source; -*targetStart = target; -return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd, - UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF32* source = *sourceStart; - UTF8* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch; - unsigned short bytesToWrite = 0; - const UTF32 byteMask = 0xBF; - const UTF32 byteMark = 0x80; - ch = *source++; - if (flags == strictConversion ) { - /* UTF-16 surrogate values are illegal in UTF-32 */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - --source; /* return to the illegal value itself */ - result = sourceIllegal; - break; - } - } - /* - * Figure out how many bytes the result will require. Turn any - * illegally large UTF32 things (> Plane 17) into replacement chars. - */ - if (ch < (UTF32)0x80) { bytesToWrite = 1; - } else if (ch < (UTF32)0x800) { bytesToWrite = 2; - } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; - } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; - } else { bytesToWrite = 3; - ch = UNI_REPLACEMENT_CHAR; - result = sourceIllegal; - } - - target += bytesToWrite; - if (target > targetEnd) { - --source; /* Back up source pointer! */ - target -= bytesToWrite; result = targetExhausted; break; - } - switch (bytesToWrite) { /* note: everything falls through. */ - case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; - case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); - } - target += bytesToWrite; - } -*sourceStart = source; -*targetStart = target; -return result; -} - -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (! isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; -} - -/* --------------------------------------------------------------------- - -Note A. -The fall-through switches in UTF-8 reading code save a -temp variable, some decrements & conditionals. The switches -are equivalent to the following loop: -{ - int tmpBytesToRead = extraBytesToRead+1; - do { - ch += *source++; - --tmpBytesToRead; - if (tmpBytesToRead) ch <<= 6; - } while (tmpBytesToRead > 0); -} -In UTF-8 writing code, the switches on "bytesToWrite" are -similarly unrolled loops. - ---------------------------------------------------------------------- */ diff --git a/src/common/convert_UTF.cc b/src/common/convert_UTF.cc new file mode 100644 index 00000000..fed04e78 --- /dev/null +++ b/src/common/convert_UTF.cc @@ -0,0 +1,569 @@ +/* + * Copyright © 1991-2015 Unicode, Inc. All rights reserved. + * Distributed under the Terms of Use in + * http://www.unicode.org/copyright.html. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Unicode data files and any associated documentation + * (the "Data Files") or Unicode software and any associated documentation + * (the "Software") to deal in the Data Files or Software + * without restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, and/or sell copies of + * the Data Files or Software, and to permit persons to whom the Data Files + * or Software are furnished to do so, provided that + * (a) this copyright and permission notice appear with all copies + * of the Data Files or Software, + * (b) this copyright and permission notice appear in associated + * documentation, and + * (c) there is clear notice in each modified Data File or in the Software + * as well as in the documentation associated with the Data File(s) or + * Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF + * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS + * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder + * shall not be used in advertising or otherwise to promote the sale, + * use or other dealings in these Data Files or Software without prior + * written authorization of the copyright holder. + */ + +/* --------------------------------------------------------------------- + +Conversions between UTF32, UTF-16, and UTF-8. Source code file. +Author: Mark E. Davis, 1994. +Rev History: Rick McGowan, fixes & updates May 2001. +Sept 2001: fixed const & error conditions per +mods suggested by S. Parent & A. Lillich. +June 2002: Tim Dodd added detection and handling of incomplete +source sequences, enhanced error detection, added casts +to eliminate compiler warnings. +July 2003: slight mods to back out aggressive FFFE detection. +Jan 2004: updated switches in from-UTF8 conversions. +Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + +See the header file "ConvertUTF.h" for complete documentation. + +------------------------------------------------------------------------ */ + + +#include "convert_UTF.h" +#ifdef CVTUTF_DEBUG +#include +#endif + +namespace google_breakpad { + +namespace { + +const int halfShift = 10; /* used for shifting by 10 bits */ + +const UTF32 halfBase = 0x0010000UL; +const UTF32 halfMask = 0x3FFUL; + +} // namespace + +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF + +#ifndef false +#define false 0 +#endif +#ifndef true +#define true 1 +#endif + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } +*sourceStart = source; +*targetStart = target; +return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG + if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); + } +#endif + return result; +} + +/* --------------------------------------------------------------------- */ + +namespace { + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ +const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs + * for *legal* UTF-8 will be 4 or fewer bytes total. + */ +const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. +* Constants have been gathered. Loops & conditionals have been removed as +* much as possible for efficiency, in favor of drop-through switches. +* (See "Note A" at the bottom of the file for equivalent code.) +* If your compiler supports it, the "isLegalUTF8" call can be turned +* into an inline function. +*/ + +} // namespace + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } +*sourceStart = source; +*targetStart = target; +return result; +} + +/* --------------------------------------------------------------------- */ + +namespace { + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ +Boolean isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; +} + +} // namespace + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (source+length > sourceEnd) { + return false; + } + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } +*sourceStart = source; +*targetStart = target; +return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } +*sourceStart = source; +*targetStart = target; +return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (source + extraBytesToRead >= sourceEnd) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (! isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- + +Note A. +The fall-through switches in UTF-8 reading code save a +temp variable, some decrements & conditionals. The switches +are equivalent to the following loop: +{ + int tmpBytesToRead = extraBytesToRead+1; + do { + ch += *source++; + --tmpBytesToRead; + if (tmpBytesToRead) ch <<= 6; + } while (tmpBytesToRead > 0); +} +In UTF-8 writing code, the switches on "bytesToWrite" are +similarly unrolled loops. + +--------------------------------------------------------------------- */ + +} // namespace google_breakpad diff --git a/src/common/convert_UTF.h b/src/common/convert_UTF.h index 644d0995..2f69495d 100644 --- a/src/common/convert_UTF.h +++ b/src/common/convert_UTF.h @@ -106,6 +106,8 @@ All should be unsigned values to avoid sign extension during bit mask & shift operations. ------------------------------------------------------------------------ */ +namespace google_breakpad { + typedef unsigned long UTF32; /* at least 32 bits */ typedef unsigned short UTF16; /* at least 16 bits */ typedef unsigned char UTF8; /* typically 8 bits */ @@ -130,11 +132,6 @@ typedef enum { lenientConversion } ConversionFlags; -/* This is for C++ and does no harm in C */ -#ifdef __cplusplus -extern "C" { -#endif - ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags); @@ -155,9 +152,7 @@ ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* so Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd); -#ifdef __cplusplus -} -#endif +} // namespace google_breakpad /* --------------------------------------------------------------------- */ -- cgit v1.2.1