diff options
author | Jens-Heiner Rechtien <hr@openoffice.org> | 2003-03-26 15:47:28 +0000 |
---|---|---|
committer | Jens-Heiner Rechtien <hr@openoffice.org> | 2003-03-26 15:47:28 +0000 |
commit | 070366678884f484afbb59f662480fe1fc83366a (patch) | |
tree | 81cbe7c7a22fbd2649fa68ae9481b0e5e434e75d /sal/textenc | |
parent | ffbf48d7ef5014de3633a4780c71a889fbde14dc (diff) |
MWS_SRX644: migrate branch mws_srx644 -> HEAD
Diffstat (limited to 'sal/textenc')
-rw-r--r-- | sal/textenc/makefile.mk | 8 | ||||
-rw-r--r-- | sal/textenc/tcvtest1.tab | 162 | ||||
-rw-r--r-- | sal/textenc/tcvtjp6.tab | 9 | ||||
-rw-r--r-- | sal/textenc/tcvtkr6.tab | 42 | ||||
-rw-r--r-- | sal/textenc/tcvtlat1.tab | 20 | ||||
-rw-r--r-- | sal/textenc/tcvtmb.c | 27 | ||||
-rw-r--r-- | sal/textenc/tcvtscn6.tab | 9 | ||||
-rw-r--r-- | sal/textenc/tcvttcn6.tab | 7 | ||||
-rw-r--r-- | sal/textenc/tcvtuni1.tab | 24 | ||||
-rw-r--r-- | sal/textenc/tcvtutf8.c | 625 | ||||
-rw-r--r-- | sal/textenc/tenchelp.h | 33 | ||||
-rw-r--r-- | sal/textenc/tencinfo.c | 9 |
12 files changed, 609 insertions, 366 deletions
diff --git a/sal/textenc/makefile.mk b/sal/textenc/makefile.mk index 0a7018f8d..adedc551a 100644 --- a/sal/textenc/makefile.mk +++ b/sal/textenc/makefile.mk @@ -2,9 +2,9 @@ # # $RCSfile: makefile.mk,v $ # -# $Revision: 1.8 $ +# $Revision: 1.9 $ # -# last change: $Author: sb $ $Date: 2002-01-15 17:01:35 $ +# last change: $Author: hr $ $Date: 2003-03-26 16:47:12 $ # # The Contents of this file are made available subject to the terms of # either of the following licenses @@ -84,7 +84,7 @@ SLOFILES = \ $(SLO)$/textenc.obj \ $(SLO)$/unichars.obj -.IF "$(UPDATER)" != "" +#.IF "$(UPDATER)" != "" OBJFILES = \ $(OBJ)$/context.obj \ @@ -105,7 +105,7 @@ OBJFILES = \ $(OBJ)$/textenc.obj \ $(OBJ)$/unichars.obj -.ENDIF # UPDATER +#.ENDIF # UPDATER # Optimization off on Solaris Intel due to internal compiler error; to be # reevaluated after compiler upgrade: diff --git a/sal/textenc/tcvtest1.tab b/sal/textenc/tcvtest1.tab index 2bd815519..ca1f039c1 100644 --- a/sal/textenc/tcvtest1.tab +++ b/sal/textenc/tcvtest1.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvtest1.tab,v $ * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * - * last change: $Author: sb $ $Date: 2002-12-12 12:29:14 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:12 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -4300,3 +4300,161 @@ static ImplTextEncodingData const aImplKOI8RTextEncodingData "koi8-r", RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME }; /* SCRIPT_CYRILLIC, pc code page 866 (855?), mac encoding 7 */ + +/* ======================================================================= */ + +/* KOI8_U */ +/* Standard Charset for Ukrainian Cyrillic */ +/* Single Byte, 0x00-0x7F equals ASCII */ +/* Conversion Tables: hand made */ + +#define KOI8UUNI_START 0x80 +#define KOI8UUNI_END 0xFF +static sal_uInt16 const aImplKOI8UToUniTab[KOI8UUNI_END - KOI8UUNI_START + 1] = +{ +/* 0 1 2 3 4 5 6 7 */ +/* 8 9 A B C D E F */ + 0x2500, 0x2502, 0x250C, 0x2510, 0x2514, 0x2518, 0x251C, 0x2524, /* 0x80 */ + 0x252C, 0x2534, 0x253C, 0x2580, 0x2584, 0x2588, 0x258C, 0x2590, /* 0x80 */ + 0x2591, 0x2592, 0x2593, 0x2320, 0x25A0, 0x2219, 0x221A, 0x2248, /* 0x90 */ + 0x2264, 0x2265, 0x00A0, 0x2321, 0x00B0, 0x00B2, 0x00B7, 0x00F7, /* 0x90 */ + 0x2550, 0x2551, 0x2552, 0x0451, 0x0454, 0x2554, 0x0456, 0x0457, /* 0xA0 */ + 0x2557, 0x2558, 0x2559, 0x255A, 0x255B, 0x0491, 0x255D, 0x255E, /* 0xA0 */ + 0x255F, 0x2560, 0x2561, 0x0401, 0x0404, 0x2563, 0x0406, 0x0407, /* 0xB0 */ + 0x2566, 0x2567, 0x2568, 0x2569, 0x256A, 0x0490, 0x256C, 0x00A9, /* 0xB0 */ + 0x044E, 0x0430, 0x0431, 0x0446, 0x0434, 0x0435, 0x0444, 0x0433, /* 0xC0 */ + 0x0445, 0x0438, 0x0439, 0x043A, 0x043B, 0x043C, 0x043D, 0x043E, /* 0xC0 */ + 0x043F, 0x044F, 0x0440, 0x0441, 0x0442, 0x0443, 0x0436, 0x0432, /* 0xD0 */ + 0x044C, 0x044B, 0x0437, 0x0448, 0x044D, 0x0449, 0x0447, 0x044A, /* 0xD0 */ + 0x042E, 0x0410, 0x0411, 0x0426, 0x0414, 0x0415, 0x0424, 0x0413, /* 0xE0 */ + 0x0425, 0x0418, 0x0419, 0x041A, 0x041B, 0x041C, 0x041D, 0x041E, /* 0xE0 */ + 0x041F, 0x042F, 0x0420, 0x0421, 0x0422, 0x0423, 0x0416, 0x0412, /* 0xF0 */ + 0x042C, 0x042B, 0x0417, 0x0428, 0x042D, 0x0429, 0x0427, 0x042A /* 0xF0 */ +}; + +#define KOI8UCHAR_START 0x0410 +#define KOI8UCHAR_END 0x044F +static sal_uChar const +aImplKOI8UToCharTab[KOI8UCHAR_END - KOI8UCHAR_START + 1] = +{ +/* 0 1 2 3 4 5 6 7 */ +/* 8 9 A B C D E F */ + 0xE1, 0xE2, 0xF7, 0xE7, 0xE4, 0xE5, 0xF6, 0xFA, /* 0x0410 */ + 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xF0, /* 0x0410 */ + 0xF2, 0xF3, 0xF4, 0xF5, 0xE6, 0xE8, 0xE3, 0xFE, /* 0x0420 */ + 0xFB, 0xFD, 0xFF, 0xF9, 0xF8, 0xFC, 0xE0, 0xF1, /* 0x0420 */ + 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xD6, 0xDA, /* 0x0430 */ + 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF, 0xD0, /* 0x0430 */ + 0xD2, 0xD3, 0xD4, 0xD5, 0xC6, 0xC8, 0xC3, 0xDE, /* 0x0440 */ + 0xDB, 0xDD, 0xDF, 0xD9, 0xD8, 0xDC, 0xC0, 0xD1 /* 0x0440 */ +}; + +#define KOI8UTOCHARTABEX_COUNT 72 +static ImplUniCharTabData const aImplKOI8UToCharTabEx[KOI8UTOCHARTABEX_COUNT] = +{ + { 0x00A0, 0x9A }, + { 0x00A9, 0xBF }, + { 0x00B0, 0x9C }, + { 0x00B2, 0x9D }, + { 0x00B7, 0x9E }, + { 0x00F7, 0x9F }, + { 0x0401, 0xB3 }, + { 0x0404, 0xB4 }, + { 0x0406, 0xB6 }, + { 0x0407, 0xB7 }, + { 0x0451, 0xA3 }, + { 0x0454, 0xA4 }, + { 0x0456, 0xA6 }, + { 0x0457, 0xA7 }, + { 0x0490, 0xBD }, + { 0x0491, 0xAD }, + { 0x2219, 0x95 }, + { 0x221A, 0x96 }, + { 0x2248, 0x97 }, + { 0x2264, 0x98 }, + { 0x2265, 0x99 }, + { 0x2320, 0x93 }, + { 0x2321, 0x9B }, + { 0x2500, 0x80 }, + { 0x2502, 0x81 }, + { 0x250C, 0x82 }, + { 0x2510, 0x83 }, + { 0x2514, 0x84 }, + { 0x2518, 0x85 }, + { 0x251C, 0x86 }, + { 0x2524, 0x87 }, + { 0x252C, 0x88 }, + { 0x2534, 0x89 }, + { 0x253C, 0x8A }, + { 0x2550, 0xA0 }, + { 0x2551, 0xA1 }, + { 0x2552, 0xA2 }, + { 0x2553, 0xA4 }, + { 0x2554, 0xA5 }, + { 0x2555, 0xA6 }, + { 0x2556, 0xA7 }, + { 0x2557, 0xA8 }, + { 0x2558, 0xA9 }, + { 0x2559, 0xAA }, + { 0x255A, 0xAB }, + { 0x255B, 0xAC }, + { 0x255C, 0xAD }, + { 0x255D, 0xAE }, + { 0x255E, 0xAF }, + { 0x255F, 0xB0 }, + { 0x2560, 0xB1 }, + { 0x2561, 0xB2 }, + { 0x2562, 0xB4 }, + { 0x2563, 0xB5 }, + { 0x2564, 0xB6 }, + { 0x2565, 0xB7 }, + { 0x2566, 0xB8 }, + { 0x2567, 0xB9 }, + { 0x2568, 0xBA }, + { 0x2569, 0xBB }, + { 0x256A, 0xBC }, + { 0x256B, 0xBD }, + { 0x256C, 0xBE }, + { 0x2580, 0x8B }, + { 0x2584, 0x8C }, + { 0x2588, 0x8D }, + { 0x258C, 0x8E }, + { 0x2590, 0x8F }, + { 0x2591, 0x90 }, + { 0x2592, 0x91 }, + { 0x2593, 0x92 }, + { 0x25A0, 0x94 } +}; + +static ImplByteConvertData const aImplKOI8UByteCvtData = +{ + aImplKOI8UToUniTab, + NULL, + KOI8UUNI_START, KOI8UUNI_END, + NOTABUNI_START, NOTABUNI_END, + aImplKOI8UToCharTab, + NULL, + aImplKOI8UToCharTabEx, + KOI8UCHAR_START, KOI8UCHAR_END, + NOTABCHAR_START, NOTABCHAR_END, + KOI8UTOCHARTABEX_COUNT +}; + +static ImplTextEncodingData const aImplKoi8UTextEncodingData + = { { &aImplKOI8UByteCvtData, + ImplCharToUnicode, + ImplUnicodeToChar, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL }, + 1, + 1, + 1, + 204, + "koi8-u", + "KOI8-U", + RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME }; + /* SCRIPT_CYRILLIC */ diff --git a/sal/textenc/tcvtjp6.tab b/sal/textenc/tcvtjp6.tab index 05fb82faf..53e042d8d 100644 --- a/sal/textenc/tcvtjp6.tab +++ b/sal/textenc/tcvtjp6.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvtjp6.tab,v $ * - * $Revision: 1.9 $ + * $Revision: 1.10 $ * - * last change: $Author: sb $ $Date: 2002-12-10 10:14:01 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:13 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -609,6 +609,7 @@ static ImplDBCSConvertData const aImplSJISDBCSCvtData = { aSJISUniLeadTab, aSJISDBCSHighTab, + 0x00, 0xFF, 0x40, 0xFC, aImplSJISDBCSEUDCTab, DBCS_EUDC_SJIS_COUNT @@ -1165,6 +1166,7 @@ static ImplDBCSConvertData const aImplMS932DBCSCvtData = { aMS932UniLeadTab, aMS932DBCSHighTab, + 0x00, 0xFF, 0x40, 0xFC, aImplSJISDBCSEUDCTab, DBCS_EUDC_SJIS_COUNT @@ -1721,6 +1723,7 @@ static ImplDBCSConvertData const aImplAPPLEJAPANESEDBCSCvtData = { aAPPLEJAPANESEUniLeadTab, aAPPLEJAPANESEDBCSHighTab, + 0x00, 0xFF, 0x40, 0xFC, aImplSJISDBCSEUDCTab, DBCS_EUDC_SJIS_COUNT @@ -2630,6 +2633,7 @@ static ImplDBCSConvertData const aImplJISX0208DBCSCvtData = { NULL, aJIS0208DBCSHighTab, + 0x00, 0xFF, 0x21, 0x7E, NULL, 0 @@ -2662,6 +2666,7 @@ static ImplDBCSConvertData const aImplJISX0212DBCSCvtData = { NULL, aJIS0212DBCSHighTab, + 0x00, 0xFF, 0x21, 0x7E, NULL, 0 diff --git a/sal/textenc/tcvtkr6.tab b/sal/textenc/tcvtkr6.tab index 5b3e93604..fe5510f88 100644 --- a/sal/textenc/tcvtkr6.tab +++ b/sal/textenc/tcvtkr6.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvtkr6.tab,v $ * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * - * last change: $Author: sb $ $Date: 2002-12-10 10:14:02 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:13 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -243,8 +243,8 @@ static ImplDBCSToUniLeadTab const aKSC5601UniLeadTab[256] = { 0, 0x41, 0xF6, aImplDBCSToUniTab_KSC5601_AB }, /* 0xAB */ { 0, 0x41, 0xF1, aImplDBCSToUniTab_KSC5601_AC }, /* 0xAC */ { 0, 0x41, 0xA0, aImplDBCSToUniTab_KSC5601_AD }, /* 0xAD */ - { 0, 0x41, 0xA1, aImplDBCSToUniTab_KSC5601_AE }, /* 0xAE */ - { 0, 0x41, 0xA1, aImplDBCSToUniTab_KSC5601_AF }, /* 0xAF */ + { 0, 0x41, 0xA0, aImplDBCSToUniTab_KSC5601_AE }, /* 0xAE */ + { 0, 0x41, 0xA0, aImplDBCSToUniTab_KSC5601_AF }, /* 0xAF */ { 0, 0x41, 0xFE, aImplDBCSToUniTab_KSC5601_B0 }, /* 0xB0 */ { 0, 0x41, 0xFE, aImplDBCSToUniTab_KSC5601_B1 }, /* 0xB1 */ { 0, 0x41, 0xFE, aImplDBCSToUniTab_KSC5601_B2 }, /* 0xB2 */ @@ -270,7 +270,7 @@ static ImplDBCSToUniLeadTab const aKSC5601UniLeadTab[256] = { 0, 0x41, 0xFE, aImplDBCSToUniTab_KSC5601_C6 }, /* 0xC6 */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_C7 }, /* 0xC7 */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_C8 }, /* 0xC8 */ - { 0, 1, 0, aImplDoubleByteIdentifierTab }, /* 0xC9 */ /* We convert this as double byte */ + { 0, 1, 0, aImplDoubleByteIdentifierTab }, /* 0xC9 */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_CA }, /* 0xCA */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_CB }, /* 0xCB */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_CC }, /* 0xCC */ @@ -323,7 +323,7 @@ static ImplDBCSToUniLeadTab const aKSC5601UniLeadTab[256] = { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_FB }, /* 0xFB */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_FC }, /* 0xFC */ { 0, 0xA1, 0xFE, aImplDBCSToUniTab_KSC5601_FD }, /* 0xFD */ - { 0, 1, 0, NULL }, /* 0xFE */ + { 0, 1, 0, aImplDoubleByteIdentifierTab }, /* 0xFE */ { 0, 1, 0, NULL } /* 0xFF */ }; @@ -600,10 +600,10 @@ static ImplDBCSEUDCData const aImplMS949DBCSEUDCTab[DBCS_EUDC_MS949_COUNT] = /* EUDC-Range */ /* Lead: C9-C9 */ /* Trail: A1-FE */ - { 0xC9, 0xC9, 0xA1, 0xFE, 0, 0, 0, 0, 1, 0x5D, 0xE000, 0xE05D }, + { 0xC9, 0xC9, 0xA1, 0xFE, 0, 0, 0, 0, 1, 0x5E, 0xE000, 0xE05D }, /* Lead: FE-FE */ /* Trail: A1-FE */ - { 0xFE, 0xFE, 0xA1, 0xFE, 0, 0, 0, 0, 1, 0x5D, 0xE05E, 0xE0BB } + { 0xFE, 0xFE, 0xA1, 0xFE, 0, 0, 0, 0, 1, 0x5E, 0xE05E, 0xE0BB } }; /* ----------------------------------------------------------------------- */ @@ -612,6 +612,19 @@ static ImplDBCSConvertData const aImplKSC5601DBCSCvtData = { aKSC5601UniLeadTab, aKSC5601DBCSHighTab, + 0xA1, 0xFE, + 0xA1, 0xFE, + aImplMS949DBCSEUDCTab, + DBCS_EUDC_MS949_COUNT +}; + +/* ----------------------------------------------------------------------- */ + +static ImplDBCSConvertData const aImplMS949DBCSCvtData = +{ + aKSC5601UniLeadTab, + aKSC5601DBCSHighTab, + 0x00, 0xFF, 0x41, 0xFE, aImplMS949DBCSEUDCTab, DBCS_EUDC_MS949_COUNT @@ -619,10 +632,6 @@ static ImplDBCSConvertData const aImplKSC5601DBCSCvtData = /* ======================================================================= */ -// TODO EUC-KR should only use two-byte characters in the range 0xA1--FE -// 0xA1--FE, and thus should have the RTL_TEXTENCODING_INFO_ASCII property. For -// whatever reason, EUC-KR is treated like MS-949 that uses trail bytes starting -// from 0x41. static ImplTextEncodingData const aImplEUCKRTextEncodingData = { { &aImplKSC5601DBCSCvtData, ImplDBCSToUnicode, @@ -639,13 +648,15 @@ static ImplTextEncodingData const aImplEUCKRTextEncodingData 129, "euc-kr", "euc-kr", - RTL_TEXTENCODING_INFO_MULTIBYTE | RTL_TEXTENCODING_INFO_MIME }; + RTL_TEXTENCODING_INFO_ASCII + | RTL_TEXTENCODING_INFO_MULTIBYTE + | RTL_TEXTENCODING_INFO_MIME }; /* SCRIPT_KOREAN, pc code page 934, mac encoding 3 */ /* ======================================================================= */ static ImplTextEncodingData const aImplMS949TextEncodingData - = { { &aImplKSC5601DBCSCvtData, + = { { &aImplMS949DBCSCvtData, ImplDBCSToUnicode, ImplUnicodeToDBCS, NULL, @@ -668,7 +679,7 @@ static ImplTextEncodingData const aImplMS949TextEncodingData /* Apple has some extension, which we don't support */ static ImplTextEncodingData const aImplAPPLEKOREANTextEncodingData - = { { &aImplKSC5601DBCSCvtData, + = { { &aImplMS949DBCSCvtData, ImplDBCSToUnicode, ImplUnicodeToDBCS, NULL, @@ -1233,6 +1244,7 @@ static ImplDBCSConvertData const aImplJOHABDBCSCvtData = { aJOHABUniLeadTab, aJOHABDBCSHighTab, + 0x00, 0xFF, 0x31, 0xFE, aImplMS1361DBCSEUDCTab, DBCS_EUDC_MS1361_COUNT diff --git a/sal/textenc/tcvtlat1.tab b/sal/textenc/tcvtlat1.tab index 2379d9876..193483fb1 100644 --- a/sal/textenc/tcvtlat1.tab +++ b/sal/textenc/tcvtlat1.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvtlat1.tab,v $ * - * $Revision: 1.5 $ + * $Revision: 1.6 $ * - * last change: $Author: sb $ $Date: 2002-12-12 12:29:14 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:14 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -542,8 +542,8 @@ static ImplTextEncodingData const aImplIBM860TextEncodingData 1, 0, "iso8859-1", - "iso-8859-1", - RTL_TEXTENCODING_INFO_ASCII }; + "IBM860", + RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME }; /* DOS/OS2, SCRIPT_LATIN, pc code page 860 */ /* ======================================================================= */ @@ -717,8 +717,8 @@ static ImplTextEncodingData const aImplIBM861TextEncodingData 1, 0, "iso8859-1", - "iso-8859-1", - RTL_TEXTENCODING_INFO_ASCII }; + "IBM861", + RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME }; /* DOS/OS2, SCRIPT_LATIN, pc code page 861, mac encoding 37 */ /* ======================================================================= */ @@ -892,8 +892,8 @@ static ImplTextEncodingData const aImplIBM863TextEncodingData 1, 0, "iso8859-1", - "iso-8859-1", - RTL_TEXTENCODING_INFO_ASCII }; + "IBM863", + RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME }; /* DOS/OS2, SCRIPT_LATIN, pc code page 863 */ /* ======================================================================= */ @@ -1067,8 +1067,8 @@ static ImplTextEncodingData const aImplIBM865TextEncodingData 1, 0, "iso8859-1", - "iso-8859-1", - RTL_TEXTENCODING_INFO_ASCII }; + "IBM865", + RTL_TEXTENCODING_INFO_ASCII | RTL_TEXTENCODING_INFO_MIME }; /* DOS/OS2, SCRIPT_LATIN, pc code page 865 */ /* ======================================================================= */ diff --git a/sal/textenc/tcvtmb.c b/sal/textenc/tcvtmb.c index 93a40a390..c4c6ed12d 100644 --- a/sal/textenc/tcvtmb.c +++ b/sal/textenc/tcvtmb.c @@ -2,9 +2,9 @@ * * $RCSfile: tcvtmb.c,v $ * - * $Revision: 1.9 $ + * $Revision: 1.10 $ * - * last change: $Author: sb $ $Date: 2002-11-06 10:06:03 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:14 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -105,7 +105,9 @@ sal_Size ImplDBCSToUnicode( const ImplTextConverterData* pData, void* pContext, pLeadEntry = pLeadTab+cLead; /* SingleByte char? */ - if ( !pLeadEntry->mpToUniTrailTab ) + if (pLeadEntry->mpToUniTrailTab == NULL + || cLead < pConvertData->mnLeadStart + || cLead > pConvertData->mnLeadEnd) { cConv = pLeadEntry->mnUniChar; if ( !cConv && (cLead != 0) ) @@ -202,7 +204,10 @@ sal_Size ImplDBCSToUnicode( const ImplTextConverterData* pData, void* pContext, /* moeglich auch richtig zu behandeln, das double byte */ /* characters auch als ein einzelner Character behandelt */ /* wird. */ - if ( (cTrail < pConvertData->mnTrailStart) || (cTrail > pConvertData->mnTrailEnd) ) + if (cLead < pConvertData->mnLeadStart + || cLead > pConvertData->mnLeadEnd + || cTrail < pConvertData->mnTrailStart + || cTrail > pConvertData->mnTrailEnd) { *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) @@ -271,6 +276,12 @@ sal_Size ImplUnicodeToDBCS( const ImplTextConverterData* pData, void* pContext, sal_Char* pEndDestBuf; const sal_Unicode* pEndSrcBuf; + sal_Bool bCheckRange = (pConvertData->mnLeadStart != 0 + || pConvertData->mnLeadEnd != 0xFF); + /* this statement has the effect that this extra check is only done for + EUC-KR, which uses the MS-949 tables, but does not support the full + range of MS-949 */ + *pInfo = 0; pEndDestBuf = pDestBuf+nDestBytes; pEndSrcBuf = pSrcBuf+nSrcChars; @@ -285,7 +296,15 @@ sal_Size ImplUnicodeToDBCS( const ImplTextConverterData* pData, void* pContext, /* is low byte in the table range */ if ( (nLowChar >= pHighEntry->mnLowStart) && (nLowChar <= pHighEntry->mnLowEnd) ) + { cConv = pHighEntry->mpToUniTrailTab[nLowChar-pHighEntry->mnLowStart]; + if (bCheckRange && cConv > 0x7F + && ((cConv >> 8) < pConvertData->mnLeadStart + || (cConv >> 8) > pConvertData->mnLeadEnd + || (cConv & 0xFF) < pConvertData->mnTrailStart + || (cConv & 0xFF) > pConvertData->mnTrailEnd)) + cConv = 0; + } else cConv = 0; diff --git a/sal/textenc/tcvtscn6.tab b/sal/textenc/tcvtscn6.tab index 8bed3a1cb..6a4b23ce7 100644 --- a/sal/textenc/tcvtscn6.tab +++ b/sal/textenc/tcvtscn6.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvtscn6.tab,v $ * - * $Revision: 1.9 $ + * $Revision: 1.10 $ * - * last change: $Author: sb $ $Date: 2002-12-10 10:14:02 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:14 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -622,6 +622,7 @@ static ImplDBCSConvertData const aImplGB2312DBCSCvtData = { aGB2312UniLeadTab, aGB2312DBCSHighTab, + 0x00, 0xFF, 0xA1, 0xFE, aImplMS936DBCSEUDCTab, DBCS_EUDC_MS936_COUNT @@ -1201,6 +1202,7 @@ static ImplDBCSConvertData const aImplGBT12345DBCSCvtData = { aGBT12345UniLeadTab, aGBT12345DBCSHighTab, + 0x00, 0xFF, 0xA1, 0xFE, aImplMS936DBCSEUDCTab, DBCS_EUDC_MS936_COUNT @@ -1845,6 +1847,7 @@ static ImplDBCSConvertData const aImplGBKDBCSCvtData = { aGBKUniLeadTab, aGBKDBCSHighTab, + 0x00, 0xFF, 0x40, 0xFE, aImplMS936DBCSEUDCTab, DBCS_EUDC_MS936_COUNT @@ -2401,6 +2404,7 @@ static ImplDBCSConvertData const aImplMS936DBCSCvtData = { aMS936UniLeadTab, aMS936DBCSHighTab, + 0x00, 0xFF, 0x40, 0xFE, aImplMS936DBCSEUDCTab, DBCS_EUDC_MS936_COUNT @@ -2958,6 +2962,7 @@ static ImplDBCSConvertData const aImplAPPLECHINSIMPDBCSCvtData = { aAPPLECHINSIMPUniLeadTab, aAPPLECHINSIMPDBCSHighTab, + 0x00, 0xFF, 0xA1, 0xFE, NULL, 0 diff --git a/sal/textenc/tcvttcn6.tab b/sal/textenc/tcvttcn6.tab index 02123d090..a224c28d9 100644 --- a/sal/textenc/tcvttcn6.tab +++ b/sal/textenc/tcvttcn6.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvttcn6.tab,v $ * - * $Revision: 1.8 $ + * $Revision: 1.9 $ * - * last change: $Author: sb $ $Date: 2002-12-10 10:14:02 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:15 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -624,6 +624,7 @@ static ImplDBCSConvertData const aImplBIG5DBCSCvtData = { aBIG5UniLeadTab, aBIG5DBCSHighTab, + 0x00, 0xFF, 0x40, 0xFE, aImplMS950DBCSEUDCTab, DBCS_EUDC_MS950_COUNT @@ -1180,6 +1181,7 @@ static ImplDBCSConvertData const aImplMS950DBCSCvtData = { aMS950UniLeadTab, aMS950DBCSHighTab, + 0x00, 0xFF, 0x40, 0xFE, aImplMS950DBCSEUDCTab, DBCS_EUDC_MS950_COUNT @@ -1737,6 +1739,7 @@ static ImplDBCSConvertData const aImplAPPLECHINTRADDBCSCvtData = { aAPPLECHINTRADUniLeadTab, aAPPLECHINTRADDBCSHighTab, + 0x00, 0xFF, 0x40, 0xFE, NULL, 0 diff --git a/sal/textenc/tcvtuni1.tab b/sal/textenc/tcvtuni1.tab index e7a3247f4..29fad070d 100644 --- a/sal/textenc/tcvtuni1.tab +++ b/sal/textenc/tcvtuni1.tab @@ -2,9 +2,9 @@ * * $RCSfile: tcvtuni1.tab,v $ * - * $Revision: 1.3 $ + * $Revision: 1.4 $ * - * last change: $Author: sb $ $Date: 2002-10-04 13:37:06 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:15 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -59,6 +59,10 @@ * ************************************************************************/ +#ifndef INCLUDED_RTL_TEXTENC_CONTEXT_H +#include "context.h" +#endif + /* Unicode Encodings */ static ImplTextEncodingData const aImplUTF7TextEncodingData @@ -86,14 +90,14 @@ static ImplTextEncodingData const aImplUTF7TextEncodingData static ImplTextEncodingData const aImplUTF8TextEncodingData = { { NULL, - ImplUTF8ToUnicode, - ImplUnicodeToUTF8, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL }, + &ImplConvertUtf8ToUnicode, + &ImplConvertUnicodeToUtf8, + &ImplCreateUtf8ToUnicodeContext, + &ImplDestroyContext, + &ImplResetUtf8ToUnicodeContext, + &ImplCreateUnicodeToUtf8Context, + &ImplDestroyContext, + &ImplResetUnicodeToUtf8Context }, 1, 6, 1, diff --git a/sal/textenc/tcvtutf8.c b/sal/textenc/tcvtutf8.c index f1b01d62e..b6cde12b5 100644 --- a/sal/textenc/tcvtutf8.c +++ b/sal/textenc/tcvtutf8.c @@ -2,9 +2,9 @@ * * $RCSfile: tcvtutf8.c,v $ * - * $Revision: 1.3 $ + * $Revision: 1.4 $ * - * last change: $Author: sb $ $Date: 2001-10-17 14:35:30 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:16 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -59,354 +59,379 @@ * ************************************************************************/ -#ifndef INCLUDED_RTL_TEXTENC_TENCHELP_H +#include "sal/types.h" +#include "rtl/alloc.h" +#include "rtl/textcvt.h" + +#include "converter.h" #include "tenchelp.h" -#endif -#ifndef INCLUDED_RTL_TEXTENC_UNICHARS_H #include "unichars.h" -#endif -#ifndef _RTL_TEXTCVT_H -#include "rtl/textcvt.h" -#endif +static struct ImplUtf8ToUnicodeContext +{ + sal_uInt32 nUtf32; + int nShift; + sal_Bool bCheckBom; +}; -/* ----------------------------------------------------------------------- */ +static struct ImplUnicodeToUtf8Context +{ + sal_Unicode nHighSurrogate; // 0xFFFF: write BOM +}; -sal_Size ImplUTF8ToUnicode( const ImplTextConverterData* pData, void* pContext, - const sal_Char* pSrcBuf, sal_Size nSrcBytes, - sal_Unicode* pDestBuf, sal_Size nDestChars, - sal_uInt32 nFlags, sal_uInt32* pInfo, - sal_Size* pSrcCvtBytes ) +void * ImplCreateUtf8ToUnicodeContext(void) { - static sal_uInt8 const nExtraBytesFromUTF8Tab[16] = - { - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 5, 0, 0 - }; - static sal_uInt8 const nFirstByteMaskTab[3] = - { - 0x07, 0x03, 0x01 - }; - - sal_uInt8 nBytes; - sal_uInt8 nTempBytes; - sal_uChar c; - sal_uInt32 cConv; - sal_Unicode* pEndDestBuf; - const sal_Char* pEndSrcBuf; - - *pInfo = 0; - pEndDestBuf = pDestBuf+nDestChars; - pEndSrcBuf = pSrcBuf+nSrcBytes; - while ( pSrcBuf < pEndSrcBuf ) + void * p = rtl_allocateMemory(sizeof (struct ImplUtf8ToUnicodeContext)); + ImplResetUtf8ToUnicodeContext(p); + return p; +} + +void ImplResetUtf8ToUnicodeContext(void * pContext) +{ + if (pContext != NULL) { - if ( pDestBuf == pEndDestBuf ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; - break; - } + ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = -1; + ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = sal_True; + } +} - c = (sal_uChar)*pSrcBuf; +sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData, + void * pContext, sal_Char const * pSrcBuf, + sal_Size nSrcBytes, sal_Unicode * pDestBuf, + sal_Size nDestChars, sal_uInt32 nFlags, + sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) +{ + // This function is very liberal with the UTF-8 input. Accepted are: + // - non-shortest forms (e.g., C0 41 instead of 41 to represent U+0041) + // - surrogates (e.g., ED A0 80 to represent U+D800) + // - encodings with up to six bytes (everything outside the range + // U+0000..10FFFF is considered "undefined") - /* 1 Byte */ - /* 0aaaaaaa (000000000aaaaaaa) */ - if ( !(c & 0x80) ) - { - *pDestBuf = (sal_Unicode)c; - pDestBuf++; - pSrcBuf++; - } - /* 2-3 Bytes */ - else if ( (c & 0xF0) != 0xF0 ) - { - /* 110aaaaa 10bbbbbb (00000aaaaabbbbbb) */ - if ( (c & 0xE0) == 0xC0 ) - { - nBytes = 2; - c &= 0x1F; /* 00001111; */ - } - /* 1110aaaa 10bbbbbb 10cccccc (aaaabbbbbbcccccc) */ - else if ( (c & 0xF0) == 0xE0 ) + sal_uInt32 nUtf32; + int nShift = -1; + sal_Bool bCheckBom = sal_True; + sal_uInt32 nInfo = 0; + sal_uChar const * pSrcBufPtr = (sal_uChar const *) pSrcBuf; + sal_uChar const * pSrcBufEnd = pSrcBufPtr + nSrcBytes; + sal_Unicode * pDestBufPtr = pDestBuf; + sal_Unicode * pDestBufEnd = pDestBufPtr + nDestChars; + + if (pContext != NULL) + { + nUtf32 = ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32; + nShift = ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift; + bCheckBom = ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom; + } + + while (pSrcBufPtr < pSrcBufEnd) + { + sal_Bool bUndefined = sal_False; + sal_Bool bConsume = sal_True; + sal_uInt32 nChar = *pSrcBufPtr++; + if (nShift < 0) + if (nChar <= 0x7F) { - nBytes = 3; - c &= 0x0F; /* 00001111; */ + nUtf32 = nChar; + goto transform; } - else + else if (nChar <= 0xBF) + goto bad_input; + else if (nChar <= 0xDF) { - *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; - if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; - break; - } - else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) - { - if ( pDestBuf >= pEndDestBuf ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; - break; - } - *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; - } - pSrcBuf++; - continue; + nUtf32 = (nChar & 0x1F) << 6; + nShift = 0; } - - if ( pSrcBuf+nBytes > pEndSrcBuf ) + else if (nChar <= 0xEF) { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; - break; + nUtf32 = (nChar & 0x0F) << 12; + nShift = 6; } - - cConv = c; - pSrcBuf++; - if ( (*pSrcBuf & 0xC0) != 0x80 ) + else if (nChar <= 0xF7) { - *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; - if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) - { - pSrcBuf--; - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; - break; - } - else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) - *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; - continue; + nUtf32 = (nChar & 0x07) << 18; + nShift = 12; } - else + else if (nChar <= 0xFB) { - c = (sal_uChar)*pSrcBuf; - cConv <<= 6; - cConv += c & 0x3F; /* 00111111 */ + nUtf32 = (nChar & 0x03) << 24; + nShift = 18; } - if ( nBytes == 3 ) + else if (nChar <= 0xFD) { - pSrcBuf++; - if ( (*pSrcBuf & 0xC0) != 0x80 ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; - if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) - { - pSrcBuf -= 2; - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; - break; - } - else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) - *pDestBuf++ - = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; - continue; - } - else - { - c = (sal_uChar)*pSrcBuf; - cConv <<= 6; - cConv += c & 0x3F; /* 00111111 */ - } + nUtf32 = (nChar & 0x01) << 30; + nShift = 24; } - *pDestBuf = (sal_Unicode)cConv; - pDestBuf++; - pSrcBuf++; + else + goto bad_input; + else if ((nChar & 0xC0) == 0x80) + { + nUtf32 |= (nChar & 0x3F) << nShift; + if (nShift == 0) + goto transform; + else + nShift -= 6; } - /* 4-6 Bytes */ else { - /* convert to ucs4 */ - nBytes = nExtraBytesFromUTF8Tab[c & 0x0F]; - if ( !nBytes ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; - if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; - break; - } - else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) - { - if ( pDestBuf >= pEndDestBuf ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; - break; - } - *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; - } - pSrcBuf++; - continue; - } - else if ( pSrcBuf+nBytes+1 > pEndSrcBuf ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; - break; - } + // This byte is preceeded by a broken UTF-8 sequence; if this byte + // is neither in the range [0x80..0xBF] nor in the range + // [0xFE..0xFF], assume that this byte does not belong to that + // broken sequence, but instead starts a new, legal UTF-8 sequence: + bConsume = nChar >= 0xFE; + goto bad_input; + } + continue; - cConv = c & nFirstByteMaskTab[nBytes-3]; - nTempBytes = nBytes; - do - { - pSrcBuf++; - if ( (*pSrcBuf & 0xC0) != 0x80 ) - break; - c = (sal_uChar)*pSrcBuf; - cConv <<= 6; - cConv += c & 0x3F; /* 00111111 */ - nTempBytes--; - } - while ( nTempBytes ); - if ( nTempBytes ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_INVALID; - if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) == RTL_TEXTTOUNICODE_FLAGS_INVALID_ERROR ) + transform: + if (!bCheckBom || nUtf32 != 0xFEFF + || (nFlags & RTL_TEXTTOUNICODE_FLAGS_GLOBAL_SIGNATURE) == 0) + if (nUtf32 <= 0xFFFF) + if (pDestBufPtr != pDestBufEnd) + *pDestBufPtr++ = (sal_Unicode) nUtf32; + else + goto no_output; + else if (nUtf32 <= 0x10FFFF) + if (pDestBufEnd - pDestBufPtr >= 2) { - pSrcBuf -= nBytes-nTempBytes+1; - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; - break; + *pDestBufPtr++ = (sal_Unicode) ImplGetHighSurrogate(nUtf32); + *pDestBufPtr++ = (sal_Unicode) ImplGetLowSurrogate(nUtf32); } - else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_INVALID_MASK) != RTL_TEXTTOUNICODE_FLAGS_INVALID_IGNORE ) - *pDestBuf++ = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; - pSrcBuf++; - continue; - } + else + goto no_output; else { - pSrcBuf++; - if ( cConv > 0x10FFFF ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_MBUNDEFINED; - if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) == RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_ERROR ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR; - break; - } - else if ( (nFlags & RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_MASK) != RTL_TEXTTOUNICODE_FLAGS_MBUNDEFINED_IGNORE ) - *pDestBuf++ - = RTL_TEXTENC_UNICODE_REPLACEMENT_CHARACTER; - } - else if ( pDestBuf+2 > pEndDestBuf ) - { - *pInfo |= RTL_TEXTTOUNICODE_INFO_ERROR | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; - break; - } - else - { - *pDestBuf++ = (sal_Unicode) ImplGetHighSurrogate(cConv); - *pDestBuf++ = (sal_Unicode) ImplGetLowSurrogate(cConv); - } + bUndefined = sal_True; + goto bad_input; } + nShift = -1; + bCheckBom = sal_False; + continue; + + bad_input: + switch (ImplHandleBadInputMbTextToUnicodeConversion(bUndefined, nFlags, + &pDestBufPtr, + pDestBufEnd, + &nInfo)) + { + case IMPL_BAD_INPUT_STOP: + nShift = -1; + bCheckBom = sal_False; + if (!bConsume) + --pSrcBufPtr; + break; + + case IMPL_BAD_INPUT_CONTINUE: + nShift = -1; + bCheckBom = sal_False; + if (!bConsume) + --pSrcBufPtr; + continue; + + case IMPL_BAD_INPUT_NO_OUTPUT: + goto no_output; } + break; + + no_output: + --pSrcBufPtr; + nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; + break; } - *pSrcCvtBytes = nSrcBytes - (pEndSrcBuf-pSrcBuf); - return (nDestChars - (pEndDestBuf-pDestBuf)); + if (nShift >= 0 + && (nInfo & (RTL_TEXTTOUNICODE_INFO_ERROR + | RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL)) + == 0) + if ((nFlags & RTL_TEXTTOUNICODE_FLAGS_FLUSH) == 0) + nInfo |= RTL_TEXTTOUNICODE_INFO_SRCBUFFERTOSMALL; + else + switch (ImplHandleBadInputMbTextToUnicodeConversion(sal_False, + nFlags, + &pDestBufPtr, + pDestBufEnd, + &nInfo)) + { + case IMPL_BAD_INPUT_STOP: + case IMPL_BAD_INPUT_CONTINUE: + nShift = -1; + bCheckBom = sal_False; + break; + + case IMPL_BAD_INPUT_NO_OUTPUT: + nInfo |= RTL_TEXTTOUNICODE_INFO_DESTBUFFERTOSMALL; + break; + } + + if (pContext != NULL) + { + ((struct ImplUtf8ToUnicodeContext *) pContext)->nUtf32 = nUtf32; + ((struct ImplUtf8ToUnicodeContext *) pContext)->nShift = nShift; + ((struct ImplUtf8ToUnicodeContext *) pContext)->bCheckBom = bCheckBom; + } + if (pInfo != NULL) + *pInfo = nInfo; + if (pSrcCvtBytes != NULL) + *pSrcCvtBytes = (sal_Char const *) pSrcBufPtr - pSrcBuf; + return pDestBufPtr - pDestBuf; } -/* ----------------------------------------------------------------------- */ +void * ImplCreateUnicodeToUtf8Context(void) +{ + void * p = rtl_allocateMemory(sizeof (struct ImplUnicodeToUtf8Context)); + ImplResetUnicodeToUtf8Context(p); + return p; +} + +void ImplResetUnicodeToUtf8Context(void * pContext) +{ + if (pContext != NULL) + ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate = 0xFFFF; +} -sal_Size ImplUnicodeToUTF8( const ImplTextConverterData* pData, void* pContext, - const sal_Unicode* pSrcBuf, sal_Size nSrcChars, - sal_Char* pDestBuf, sal_Size nDestBytes, - sal_uInt32 nFlags, sal_uInt32* pInfo, - sal_Size* pSrcCvtChars ) +sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData, + void * pContext, sal_Unicode const * pSrcBuf, + sal_Size nSrcChars, sal_Char * pDestBuf, + sal_Size nDestBytes, sal_uInt32 nFlags, + sal_uInt32 * pInfo, sal_Size* pSrcCvtChars) { - static sal_uInt8 const nFirstByteMarkTab[6] = + sal_Unicode nHighSurrogate = 0xFFFF; + sal_uInt32 nInfo = 0; + sal_Unicode const * pSrcBufPtr = pSrcBuf; + sal_Unicode const * pSrcBufEnd = pSrcBufPtr + nSrcChars; + sal_Char * pDestBufPtr = pDestBuf; + sal_Char * pDestBufEnd = pDestBufPtr + nDestBytes; + + if (pContext != NULL) + nHighSurrogate + = ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate; + + if (nHighSurrogate == 0xFFFF) { - 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC - }; - - sal_Unicode c; - sal_Unicode c2; - sal_uInt32 nUCS4Char; - sal_uInt8 nBytes; - sal_Char* pTempDestBuf; - sal_Char* pEndDestBuf; - const sal_Unicode* pEndSrcBuf; - - *pInfo = 0; - pEndDestBuf = pDestBuf+nDestBytes; - pEndSrcBuf = pSrcBuf+nSrcChars; - while ( pSrcBuf < pEndSrcBuf ) + if ((nFlags & RTL_UNICODETOTEXT_FLAGS_GLOBAL_SIGNATURE) != 0) + if (pDestBufEnd - pDestBufPtr >= 3) + { + // Write BOM (U+FEFF) as UTF-8: + *pDestBufPtr++ = (sal_Char) 0xEF; + *pDestBufPtr++ = (sal_Char) 0xBB; + *pDestBufPtr++ = (sal_Char) 0xBF; + } + else + { + nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + goto done; + } + nHighSurrogate = 0; + } + + while (pSrcBufPtr < pSrcBufEnd) { - c = *pSrcBuf; - if ( c < 0x80 ) + sal_uInt32 nChar = *pSrcBufPtr++; + if (nHighSurrogate == 0) { - if ( pDestBuf == pEndDestBuf ) + if (ImplIsHighSurrogate(nChar)) { - *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; - break; + nHighSurrogate = (sal_Unicode) nChar; + continue; } - - *pDestBuf = (sal_Char)(sal_uChar)c; - pDestBuf++; - pSrcBuf++; } + else if (ImplIsLowSurrogate(nChar)) + nChar = ImplCombineSurrogates(nHighSurrogate, nChar); else - { - nUCS4Char = c; - if ( nUCS4Char < 0x800 ) - nBytes = 2; - else - { - if (ImplIsHighSurrogate(c)) - { - if ( pSrcBuf == pEndSrcBuf ) - { - *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; - break; - } - - c2 = *(pSrcBuf+1); - if (ImplIsLowSurrogate(c2)) - { - nUCS4Char = ImplCombineSurrogates(c, c2); - pSrcBuf++; - } - else - { - *pInfo |= RTL_UNICODETOTEXT_INFO_INVALID; - if ( (nFlags & RTL_UNICODETOTEXT_FLAGS_INVALID_MASK) == RTL_UNICODETOTEXT_FLAGS_INVALID_ERROR ) - { - *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR; - break; - } - else if ( (nFlags & RTL_UNICODETOTEXT_FLAGS_INVALID_MASK) == RTL_UNICODETOTEXT_FLAGS_INVALID_IGNORE ) - { - pSrcBuf++; - continue; - } - /* in UTF8 we save the original code. I think */ - /* this is better than the default char, */ - /* because it is a unicode format. */ - } - } + goto bad_input; - if ( nUCS4Char < 0x10000 ) - nBytes = 3; - else if ( nUCS4Char < 0x200000 ) - nBytes = 4; - else if ( nUCS4Char < 0x4000000 ) - nBytes = 5; - else - nBytes = 6; - } + if (ImplIsLowSurrogate(nChar) || ImplIsNoncharacter(nChar)) + goto bad_input; - if ( pDestBuf+nBytes > pEndDestBuf ) + if (nChar <= 0x7F) + if (pDestBufPtr != pDestBufEnd) + *pDestBufPtr++ = (sal_Char) nChar; + else + goto no_output; + else if (nChar <= 0x7FF) + if (pDestBufEnd - pDestBufPtr >= 2) { - *pInfo |= RTL_UNICODETOTEXT_INFO_ERROR | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; - break; + *pDestBufPtr++ = (sal_Char) (0xC0 | (nChar >> 6)); + *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); } - pDestBuf += nBytes; - pTempDestBuf = pDestBuf; - switch ( nBytes ) /* no breaks, only jump table */ + else + goto no_output; + else if (nChar <= 0xFFFF) + if (pDestBufEnd - pDestBufPtr >= 3) { - case 6: pTempDestBuf--; *pTempDestBuf = (sal_Char)((((sal_uChar)nUCS4Char) | 0x80) & 0xBF); nUCS4Char >>= 6; - case 5: pTempDestBuf--; *pTempDestBuf = (sal_Char)((((sal_uChar)nUCS4Char) | 0x80) & 0xBF); nUCS4Char >>= 6; - case 4: pTempDestBuf--; *pTempDestBuf = (sal_Char)((((sal_uChar)nUCS4Char) | 0x80) & 0xBF); nUCS4Char >>= 6; - case 3: pTempDestBuf--; *pTempDestBuf = (sal_Char)((((sal_uChar)nUCS4Char) | 0x80) & 0xBF); nUCS4Char >>= 6; - case 2: pTempDestBuf--; *pTempDestBuf = (sal_Char)((((sal_uChar)nUCS4Char) | 0x80) & 0xBF); nUCS4Char >>= 6; - }; - pTempDestBuf--; - *pTempDestBuf = (sal_Char)(((sal_uChar)nUCS4Char) | nFirstByteMarkTab[nBytes-1]); - pSrcBuf++; + *pDestBufPtr++ = (sal_Char) (0xE0 | (nChar >> 12)); + *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); + *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); + } + else + goto no_output; + else if (pDestBufEnd - pDestBufPtr >= 4) + { + *pDestBufPtr++ = (sal_Char) (0xF0 | (nChar >> 18)); + *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 12) & 0x3F)); + *pDestBufPtr++ = (sal_Char) (0x80 | ((nChar >> 6) & 0x3F)); + *pDestBufPtr++ = (sal_Char) (0x80 | (nChar & 0x3F)); + } + else + goto no_output; + nHighSurrogate = 0; + continue; + + bad_input: + switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, nFlags, + &pDestBufPtr, + pDestBufEnd, &nInfo, + NULL, 0, NULL)) + { + case IMPL_BAD_INPUT_STOP: + nHighSurrogate = 0; + break; + + case IMPL_BAD_INPUT_CONTINUE: + nHighSurrogate = 0; + continue; + + case IMPL_BAD_INPUT_NO_OUTPUT: + goto no_output; } + break; + + no_output: + --pSrcBufPtr; + nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; } - *pSrcCvtChars = nSrcChars - (pEndSrcBuf-pSrcBuf); - return (nDestBytes - (pEndDestBuf-pDestBuf)); + if (nHighSurrogate != 0 + && (nInfo & (RTL_UNICODETOTEXT_INFO_ERROR + | RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL)) + == 0) + if ((nFlags & RTL_UNICODETOTEXT_FLAGS_FLUSH) != 0) + nInfo |= RTL_UNICODETOTEXT_INFO_SRCBUFFERTOSMALL; + else + switch (ImplHandleBadInputUnicodeToTextConversion(sal_False, 0, + nFlags, + &pDestBufPtr, + pDestBufEnd, + &nInfo, NULL, 0, + NULL)) + { + case IMPL_BAD_INPUT_STOP: + case IMPL_BAD_INPUT_CONTINUE: + nHighSurrogate = 0; + break; + + case IMPL_BAD_INPUT_NO_OUTPUT: + nInfo |= RTL_UNICODETOTEXT_INFO_DESTBUFFERTOSMALL; + break; + } + + done: + if (pContext != NULL) + ((struct ImplUnicodeToUtf8Context *) pContext)->nHighSurrogate + = nHighSurrogate; + if (pInfo != NULL) + *pInfo = nInfo; + if (pSrcCvtChars != NULL) + *pSrcCvtChars = pSrcBufPtr - pSrcBuf; + return pDestBufPtr - pDestBuf; } diff --git a/sal/textenc/tenchelp.h b/sal/textenc/tenchelp.h index 1f997859a..fed24ac6f 100644 --- a/sal/textenc/tenchelp.h +++ b/sal/textenc/tenchelp.h @@ -2,9 +2,9 @@ * * $RCSfile: tenchelp.h,v $ * - * $Revision: 1.7 $ + * $Revision: 1.8 $ * - * last change: $Author: sb $ $Date: 2002-10-04 13:37:07 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:16 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -215,6 +215,8 @@ typedef struct { const ImplDBCSToUniLeadTab* mpToUniLeadTab; const ImplUniToDBCSHighTab* mpToDBCSHighTab; + sal_uChar mnLeadStart; + sal_uChar mnLeadEnd; sal_uChar mnTrailStart; sal_uChar mnTrailEnd; const ImplDBCSEUDCData* mpEUDCTab; @@ -307,16 +309,23 @@ sal_Size ImplUnicodeToUTF7( const ImplTextConverterData* pData, void* pContext, sal_Char* pDestBuf, sal_Size nDestBytes, sal_uInt32 nFlags, sal_uInt32* pInfo, sal_Size* pSrcCvtChars ); -sal_Size ImplUTF8ToUnicode( const ImplTextConverterData* pData, void* pContext, - const sal_Char* pSrcBuf, sal_Size nSrcBytes, - sal_Unicode* pDestBuf, sal_Size nDestChars, - sal_uInt32 nFlags, sal_uInt32* pInfo, - sal_Size* pSrcCvtBytes ); -sal_Size ImplUnicodeToUTF8( const ImplTextConverterData* pData, void* pContext, - const sal_Unicode* pSrcBuf, sal_Size nSrcChars, - sal_Char* pDestBuf, sal_Size nDestBytes, - sal_uInt32 nFlags, sal_uInt32* pInfo, - sal_Size* pSrcCvtChars ); + +void * ImplCreateUtf8ToUnicodeContext(void) SAL_THROW_EXTERN_C(); +void ImplResetUtf8ToUnicodeContext(void * pContext) SAL_THROW_EXTERN_C(); +sal_Size ImplConvertUtf8ToUnicode(ImplTextConverterData const * pData, + void * pContext, sal_Char const * pSrcBuf, + sal_Size nSrcBytes, sal_Unicode * pDestBuf, + sal_Size nDestChars, sal_uInt32 nFlags, + sal_uInt32 * pInfo, sal_Size * pSrcCvtBytes) + SAL_THROW_EXTERN_C(); +void * ImplCreateUnicodeToUtf8Context(void) SAL_THROW_EXTERN_C(); +void ImplResetUnicodeToUtf8Context(void * pContext) SAL_THROW_EXTERN_C(); +sal_Size ImplConvertUnicodeToUtf8(ImplTextConverterData const * pData, + void * pContext, sal_Unicode const * pSrcBuf, + sal_Size nSrcChars, sal_Char * pDestBuf, + sal_Size nDestBytes, sal_uInt32 nFlags, + sal_uInt32 * pInfo, sal_Size* pSrcCvtChars) + SAL_THROW_EXTERN_C(); #if defined __cplusplus } diff --git a/sal/textenc/tencinfo.c b/sal/textenc/tencinfo.c index 4fa6dc847..2e25744cc 100644 --- a/sal/textenc/tencinfo.c +++ b/sal/textenc/tencinfo.c @@ -2,9 +2,9 @@ * * $RCSfile: tencinfo.c,v $ * - * $Revision: 1.18 $ + * $Revision: 1.19 $ * - * last change: $Author: sb $ $Date: 2002-05-29 11:41:24 $ + * last change: $Author: hr $ $Date: 2003-03-26 16:47:17 $ * * The Contents of this file are made available subject to the terms of * either of the following licenses @@ -86,7 +86,7 @@ sal_Bool SAL_CALL rtl_isOctetTextEncoding(rtl_TextEncoding nEncoding) { return nEncoding > RTL_TEXTENCODING_DONTKNOW - && nEncoding <= RTL_TEXTENCODING_TIS_620 /* always update this! */ + && nEncoding <= RTL_TEXTENCODING_KOI8_U /* always update this! */ && nEncoding != 9; /* RTL_TEXTENCODING_SYSTEM */ } @@ -543,6 +543,7 @@ rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromUnixCharset( const sal_Char* pU static ImplStrCharsetDef const aUnixCharsetKOI8Tab[] = { { "r", RTL_TEXTENCODING_KOI8_R }, + { "u", RTL_TEXTENCODING_KOI8_U }, { NULL, RTL_TEXTENCODING_DONTKNOW } }; @@ -942,6 +943,8 @@ rtl_TextEncoding SAL_CALL rtl_getTextEncodingFromMimeCharset( const sal_Char* pM { "windows874", RTL_TEXTENCODING_MS_874 }, /* This is no official MIME character set name, but it might be in use in Thailand. */ + { "koi8u", RTL_TEXTENCODING_KOI8_U }, + { "cpis", RTL_TEXTENCODING_IBM_861 }, { NULL, RTL_TEXTENCODING_DONTKNOW } }; |