summaryrefslogtreecommitdiff
path: root/charsetalias.c
diff options
context:
space:
mode:
authorLuo Jinghua <sunmoon1997@gmail.com>2010-01-07 16:27:41 +0800
committerLuo Jinghua <sunmoon1997@gmail.com>2010-01-07 16:27:41 +0800
commit16001b547a8f27aceb1691efb3e07ab9c3af0030 (patch)
tree9f804667053a6a4ea3e10aac11ae6c9f2cb5af87 /charsetalias.c
Initial commit of uniconv
Diffstat (limited to 'charsetalias.c')
-rw-r--r--charsetalias.c499
1 files changed, 499 insertions, 0 deletions
diff --git a/charsetalias.c b/charsetalias.c
new file mode 100644
index 0000000..edfe863
--- /dev/null
+++ b/charsetalias.c
@@ -0,0 +1,499 @@
+#include "charsetalias.h"
+
+#include <ctype.h>
+#include <string.h>
+
+#if defined(WIN32) || defined(_WIN32)
+#define strcasecmp(x, y) stricmp(x, y)
+#endif
+
+const char*
+get_canonical_charset(const char *charset)
+{
+ static const char *aliases[] = {
+ "646" , "ascii",
+ "ansi_x3.4_1968" , "ascii",
+ "ansi_x3_4_1968" , "ascii", /* some email headers use this non-standard name */
+ "ansi_x3.4_1986" , "ascii",
+ "cp367" , "ascii",
+ "csascii" , "ascii",
+ "ibm367" , "ascii",
+ "iso646_us" , "ascii",
+ "iso_646.irv_1991" , "ascii",
+ "iso_ir_6" , "ascii",
+ "us" , "ascii",
+ "us_ascii" , "ascii",
+
+ /* big5 codec */
+ "big5_tw" , "big5",
+ "csbig5" , "big5",
+
+ /* big5hkscs codec */
+ "big5_hkscs" , "big5hkscs",
+ "hkscs" , "big5hkscs",
+
+ /* cp037 codec */
+ "037" , "cp037",
+ "csibm037" , "cp037",
+ "ebcdic_cp_ca" , "cp037",
+ "ebcdic_cp_nl" , "cp037",
+ "ebcdic_cp_us" , "cp037",
+ "ebcdic_cp_wt" , "cp037",
+ "ibm037" , "cp037",
+ "ibm039" , "cp037",
+
+ /* cp1026 codec */
+ "1026" , "cp1026",
+ "csibm1026" , "cp1026",
+ "ibm1026" , "cp1026",
+
+ /* cp1140 codec */
+ "1140" , "cp1140",
+ "ibm1140" , "cp1140",
+
+ /* cp1250 codec */
+ "1250" , "cp1250",
+ "windows_1250" , "cp1250",
+
+ /* cp1251 codec */
+ "1251" , "cp1251",
+ "windows_1251" , "cp1251",
+
+ /* cp1252 codec */
+ "1252" , "cp1252",
+ "windows_1252" , "cp1252",
+
+ /* cp1253 codec */
+ "1253" , "cp1253",
+ "windows_1253" , "cp1253",
+
+ /* cp1254 codec */
+ "1254" , "cp1254",
+ "windows_1254" , "cp1254",
+
+ /* cp1255 codec */
+ "1255" , "cp1255",
+ "windows_1255" , "cp1255",
+
+ /* cp1256 codec */
+ "1256" , "cp1256",
+ "windows_1256" , "cp1256",
+
+ /* cp1257 codec */
+ "1257" , "cp1257",
+ "windows_1257" , "cp1257",
+
+ /* cp1258 codec */
+ "1258" , "cp1258",
+ "windows_1258" , "cp1258",
+
+ /* cp424 codec */
+ "424" , "cp424",
+ "csibm424" , "cp424",
+ "ebcdic_cp_he" , "cp424",
+ "ibm424" , "cp424",
+
+ /* cp437 codec */
+ "437" , "cp437",
+ "cspc8codepage437" , "cp437",
+ "ibm437" , "cp437",
+
+ /* cp500 codec */
+ "500" , "cp500",
+ "csibm500" , "cp500",
+ "ebcdic_cp_be" , "cp500",
+ "ebcdic_cp_ch" , "cp500",
+ "ibm500" , "cp500",
+
+ /* cp775 codec */
+ "775" , "cp775",
+ "cspc775baltic" , "cp775",
+ "ibm775" , "cp775",
+
+ /* cp850 codec */
+ "850" , "cp850",
+ "cspc850multilingual" , "cp850",
+ "ibm850" , "cp850",
+
+ /* cp852 codec */
+ "852" , "cp852",
+ "cspcp852" , "cp852",
+ "ibm852" , "cp852",
+
+ /* cp855 codec */
+ "855" , "cp855",
+ "csibm855" , "cp855",
+ "ibm855" , "cp855",
+
+ /* cp857 codec */
+ "857" , "cp857",
+ "csibm857" , "cp857",
+ "ibm857" , "cp857",
+
+ /* cp860 codec */
+ "860" , "cp860",
+ "csibm860" , "cp860",
+ "ibm860" , "cp860",
+
+ /* cp861 codec */
+ "861" , "cp861",
+ "cp_is" , "cp861",
+ "csibm861" , "cp861",
+ "ibm861" , "cp861",
+
+ /* cp862 codec */
+ "862" , "cp862",
+ "cspc862latinhebrew" , "cp862",
+ "ibm862" , "cp862",
+
+ /* cp863 codec */
+ "863" , "cp863",
+ "csibm863" , "cp863",
+ "ibm863" , "cp863",
+
+ /* cp864 codec */
+ "864" , "cp864",
+ "csibm864" , "cp864",
+ "ibm864" , "cp864",
+
+ /* cp865 codec */
+ "865" , "cp865",
+ "csibm865" , "cp865",
+ "ibm865" , "cp865",
+
+ /* cp866 codec */
+ "866" , "cp866",
+ "csibm866" , "cp866",
+ "ibm866" , "cp866",
+
+ /* cp869 codec */
+ "869" , "cp869",
+ "cp_gr" , "cp869",
+ "csibm869" , "cp869",
+ "ibm869" , "cp869",
+
+ /* cp932 codec */
+ "932" , "cp932",
+ "ms932" , "cp932",
+ "mskanji" , "cp932",
+ "ms_kanji" , "cp932",
+
+ /* cp949 codec */
+ "949" , "cp949",
+ "ms949" , "cp949",
+ "uhc" , "cp949",
+
+ /* cp950 codec */
+ "950" , "cp950",
+ "ms950" , "cp950",
+
+ /* euc_jis_2004 codec */
+ "jisx0213" , "euc_jis_2004",
+ "eucjis2004" , "euc_jis_2004",
+ "euc_jis2004" , "euc_jis_2004",
+
+ /* euc_jisx0213 codec */
+ "eucjisx0213" , "euc_jisx0213",
+
+ /* euc_jp codec */
+ "eucjp" , "euc_jp",
+ "ujis" , "euc_jp",
+ "u_jis" , "euc_jp",
+
+ /* euc_kr codec */
+ "euckr" , "euc_kr",
+ "korean" , "euc_kr",
+ "ksc5601" , "euc_kr",
+ "ks_c_5601" , "euc_kr",
+ "ks_c_5601_1987" , "euc_kr",
+ "ksx1001" , "euc_kr",
+ "ks_x_1001" , "euc_kr",
+
+ /* gb18030 codec */
+ "gb18030_2000" , "gb18030",
+
+ /* gb2312 codec */
+ "chinese" , "gb2312",
+ "csiso58gb231280" , "gb2312",
+ "euc_cn" , "gb2312",
+ "euccn" , "gb2312",
+ "eucgb2312_cn" , "gb2312",
+ "gb2312_1980" , "gb2312",
+ "gb2312_80" , "gb2312",
+ "iso_ir_58" , "gb2312",
+
+ /* gbk codec */
+ "936" , "gbk",
+ "cp936" , "gbk",
+ "ms936" , "gbk",
+
+ /* hp_roman8 codec */
+ "roman8" , "hp_roman8",
+ "r8" , "hp_roman8",
+ "csHPRoman8" , "hp_roman8",
+
+ /* hz codec */
+ "hzgb" , "hz",
+ "hz_gb" , "hz",
+ "hz_gb_2312" , "hz",
+
+ /* iso2022_jp codec */
+ "csiso2022jp" , "iso2022_jp",
+ "iso2022jp" , "iso2022_jp",
+ "iso_2022_jp" , "iso2022_jp",
+
+ /* iso2022_jp_1 codec */
+ "iso2022jp_1" , "iso2022_jp_1",
+ "iso_2022_jp_1" , "iso2022_jp_1",
+
+ /* iso2022_jp_2 codec */
+ "iso2022jp_2" , "iso2022_jp_2",
+ "iso_2022_jp_2" , "iso2022_jp_2",
+
+ /* iso2022_jp_2004 codec */
+ "iso_2022_jp_2004" , "iso2022_jp_2004",
+ "iso2022jp_2004" , "iso2022_jp_2004",
+
+ /* iso2022_jp_3 codec */
+ "iso2022jp_3" , "iso2022_jp_3",
+ "iso_2022_jp_3" , "iso2022_jp_3",
+
+ /* iso2022_jp_ext codec */
+ "iso2022jp_ext" , "iso2022_jp_ext",
+ "iso_2022_jp_ext" , "iso2022_jp_ext",
+
+ /* iso2022_kr codec */
+ "csiso2022kr" , "iso2022_kr",
+ "iso2022kr" , "iso2022_kr",
+ "iso_2022_kr" , "iso2022_kr",
+
+ /* iso8859_10 codec */
+ "csisolatin6" , "iso8859_10",
+ "iso_8859_10" , "iso8859_10",
+ "iso_8859_10_1992" , "iso8859_10",
+ "iso_ir_157" , "iso8859_10",
+ "l6" , "iso8859_10",
+ "latin6" , "iso8859_10",
+
+ /* iso8859_11 codec */
+ "thai" , "iso8859_11",
+ "iso_8859_11" , "iso8859_11",
+ "iso_8859_11_2001" , "iso8859_11",
+
+ /* iso8859_13 codec */
+ "iso_8859_13" , "iso8859_13",
+ "l7" , "iso8859_13",
+ "latin7" , "iso8859_13",
+
+ /* iso8859_14 codec */
+ "iso_8859_14" , "iso8859_14",
+ "iso_8859_14_1998" , "iso8859_14",
+ "iso_celtic" , "iso8859_14",
+ "iso_ir_199" , "iso8859_14",
+ "l8" , "iso8859_14",
+ "latin8" , "iso8859_14",
+
+ /* iso8859_15 codec */
+ "iso_8859_15" , "iso8859_15",
+ "l9" , "iso8859_15",
+ "latin9" , "iso8859_15",
+
+ /* iso8859_16 codec */
+ "iso_8859_16" , "iso8859_16",
+ "iso_8859_16_2001" , "iso8859_16",
+ "iso_ir_226" , "iso8859_16",
+ "l10" , "iso8859_16",
+ "latin10" , "iso8859_16",
+
+ /* iso8859_2 codec */
+ "csisolatin2" , "iso8859_2",
+ "iso_8859_2" , "iso8859_2",
+ "iso_8859_2_1987" , "iso8859_2",
+ "iso_ir_101" , "iso8859_2",
+ "l2" , "iso8859_2",
+ "latin2" , "iso8859_2",
+
+ /* iso8859_3 codec */
+ "csisolatin3" , "iso8859_3",
+ "iso_8859_3" , "iso8859_3",
+ "iso_8859_3_1988" , "iso8859_3",
+ "iso_ir_109" , "iso8859_3",
+ "l3" , "iso8859_3",
+ "latin3" , "iso8859_3",
+
+ /* iso8859_4 codec */
+ "csisolatin4" , "iso8859_4",
+ "iso_8859_4" , "iso8859_4",
+ "iso_8859_4_1988" , "iso8859_4",
+ "iso_ir_110" , "iso8859_4",
+ "l4" , "iso8859_4",
+ "latin4" , "iso8859_4",
+
+ /* iso8859_5 codec */
+ "csisolatincyrillic" , "iso8859_5",
+ "cyrillic" , "iso8859_5",
+ "iso_8859_5" , "iso8859_5",
+ "iso_8859_5_1988" , "iso8859_5",
+ "iso_ir_144" , "iso8859_5",
+
+ /* iso8859_6 codec */
+ "arabic" , "iso8859_6",
+ "asmo_708" , "iso8859_6",
+ "csisolatinarabic" , "iso8859_6",
+ "ecma_114" , "iso8859_6",
+ "iso_8859_6" , "iso8859_6",
+ "iso_8859_6_1987" , "iso8859_6",
+ "iso_ir_127" , "iso8859_6",
+
+ /* iso8859_7 codec */
+ "csisolatingreek" , "iso8859_7",
+ "ecma_118" , "iso8859_7",
+ "elot_928" , "iso8859_7",
+ "greek" , "iso8859_7",
+ "greek8" , "iso8859_7",
+ "iso_8859_7" , "iso8859_7",
+ "iso_8859_7_1987" , "iso8859_7",
+ "iso_ir_126" , "iso8859_7",
+
+ /* iso8859_8 codec */
+ "csisolatinhebrew" , "iso8859_8",
+ "hebrew" , "iso8859_8",
+ "iso_8859_8" , "iso8859_8",
+ "iso_8859_8_1988" , "iso8859_8",
+ "iso_ir_138" , "iso8859_8",
+
+ /* iso8859_9 codec */
+ "csisolatin5" , "iso8859_9",
+ "iso_8859_9" , "iso8859_9",
+ "iso_8859_9_1989" , "iso8859_9",
+ "iso_ir_148" , "iso8859_9",
+ "l5" , "iso8859_9",
+ "latin5" , "iso8859_9",
+
+ /* johab codec */
+ "cp1361" , "johab",
+ "ms1361" , "johab",
+
+ /* koi8_r codec */
+ "cskoi8r" , "koi8_r",
+
+ /* latin_1 code */
+ /* Note that the latin_1 codec is implemented internally in C and a */
+ /* lot faster than the charmap codec iso8859_1 which uses the same */
+ /* encoding. This is why we discourage the use of the iso8859_1 */
+ /* codec and alias it to latin_1 instead. */
+
+ "8859" , "iso8859_1",
+ "cp819" , "iso8859_1",
+ "csisolatin1" , "iso8859_1",
+ "ibm819" , "iso8859_1",
+ "iso8859" , "iso8859_1",
+ "iso8859_1" , "iso8859_1",
+ "iso_8859_1" , "iso8859_1",
+ "iso_8859_1_1987" , "iso8859_1",
+ "iso_ir_100" , "iso8859_1",
+ "l1" , "iso8859_1",
+ "latin" , "iso8859_1",
+ "latin1" , "iso8859_1",
+
+ /* mac_cyrillic codec */
+ "maccyrillic" , "mac_cyrillic",
+
+ /* mac_greek codec */
+ "macgreek" , "mac_greek",
+
+ /* mac_iceland codec */
+ "maciceland" , "mac_iceland",
+
+ /* mac_latin2 codec */
+ "maccentraleurope" , "mac_latin2",
+ "maclatin2" , "mac_latin2",
+
+ /* mac_roman codec */
+ "macroman" , "mac_roman",
+
+ /* mac_turkish codec */
+ "macturkish" , "mac_turkish",
+
+ /* ptcp154 codec */
+ "csptcp154" , "ptcp154",
+ "pt154" , "ptcp154",
+ "cp154" , "ptcp154",
+ "cyrillic-asian" , "ptcp154",
+
+ /* quopri_codec codec */
+ "quopri" , "quopri_codec",
+ "quoted_printable" , "quopri_codec",
+ "quotedprintable" , "quopri_codec",
+
+ /* shift_jis codec */
+ "csshiftjis" , "shift_jis",
+ "shiftjis" , "shift_jis",
+ "sjis" , "shift_jis",
+ "s_jis" , "shift_jis",
+
+ /* shift_jis_2004 codec */
+ "shiftjis2004" , "shift_jis_2004",
+ "sjis_2004" , "shift_jis_2004",
+ "s_jis_2004" , "shift_jis_2004",
+
+ /* shift_jisx0213 codec */
+ "shiftjisx0213" , "shift_jisx0213",
+ "sjisx0213" , "shift_jisx0213",
+ "s_jisx0213" , "shift_jisx0213",
+
+ /* tactis codec */
+ "tis260" , "tactis",
+
+ /* tis_620 codec */
+ "tis620" , "tis_620",
+ "tis_620_0" , "tis_620",
+ "tis_620_2529_0" , "tis_620",
+ "tis_620_2529_1" , "tis_620",
+ "iso_ir_166" , "tis_620",
+
+ /* utf_16 codec */
+ "u16" , "utf_16",
+ "utf16" , "utf_16",
+
+ /* utf_16_be codec */
+ "unicodebigunmarked" , "utf_16_be",
+ "utf_16be" , "utf_16_be",
+
+ /* utf_16_le codec */
+ "unicodelittleunmarked" , "utf_16_le",
+ "utf_16le" , "utf_16_le",
+
+ /* utf_32 codec */
+ "u32" , "utf_32",
+ "utf32" , "utf_32",
+ "unicode" , "utf_32",
+
+ /* utf_32_be codec */
+ "utf_32be" , "utf_32_be",
+
+ /* utf_32_le codec */
+ "utf_32le" , "utf_32_le",
+
+ /* utf_7 codec */
+ "u7" , "utf_7",
+ "utf7" , "utf_7",
+ "unicode_1_1_utf_7" , "utf_7",
+
+ /* utf_8 codec */
+ "u8" , "utf_8",
+ "utf" , "utf_8",
+ "utf8" , "utf_8",
+ "utf8_ucs2" , "utf_8",
+ "utf8_ucs4" , "utf_8",
+ NULL, NULL
+ };
+ size_t i = 0;
+
+ for (i = 0; aliases[i]; i += 2)
+ if (!strcasecmp(charset, aliases[i]))
+ return aliases[i + 1];
+
+ return charset;
+}
+