diff options
author | Jehan <jehan@girinstud.io> | 2021-03-20 22:12:45 +0100 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-14 00:24:53 +0100 |
commit | b725c0b2ff709b80d759b02906b2505a3455da20 (patch) | |
tree | 971aa30e8a8b3d05bf48caa7a497d0d32f7ca2a3 | |
parent | c782177a8d8fc895f8dc12b85ae428df1b7d3cb3 (diff) |
src: new nsCJKDetector specifically for Chinese/Japanese/Korean recognition.
I was pondering improving the logic of the LanguageModel contents, in
order to better handle languages with a huge number of characters (far
too many to keep a full list of frequent characters while keeping
reasonable memory consumption and speed).
But then I realized that this happens for languages which have their
own set of characters anyway.
For instance, modern Korean is near full hangul. Of course, we can find
some Chinese characters here and there, but nothing which should really
break confidence if we base it on the hangul ratio. Of course if some
day we want to go further and detect older Korean, we will have to
improve the logic a bit with some statistics, though I wonder if
limiting ourselves to character frequency is not enough here (sequence
frequency is maybe a bit overboard). To be tested.
In any case, this new class gives much more relevant confidence on
Korean texts, compared to the statistics data we previously generated.
For Japanese, it is a mix of kana and Chinese characters. A modern full
text cannot exist without a lot of kana (probably only old texts or very
short texts, such as titles, could have only Chinese characters). We
would still want to add a bit of statistics to correctly differentiate a
Japanese text with a lot of Chinese characters in it from a Chinese
text which quotes a few Japanese phrases. It will have to be
improved, but for now it works fairly well.
A last case where we would want to play with statistics might be if we
want to differentiate between regional variants. For instance,
Simplified Chinese, Taiwan or Hong Kong Chinese… More to experiment
with later on. It's already a good first step for UTF-8 support with
language detection!
-rw-r--r-- | src/CMakeLists.txt | 1 | ||||
-rw-r--r-- | src/nsCJKDetector.cpp | 239 | ||||
-rw-r--r-- | src/nsCJKDetector.h | 70 | ||||
-rw-r--r-- | src/nsMBCSGroupProber.cpp | 4 |
4 files changed, 313 insertions, 1 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index bb68b4c..c79532e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -56,6 +56,7 @@ set( nsSJISProber.cpp nsUTF8Prober.cpp nsLanguageDetector.cpp + nsCJKDetector.cpp nsLatin1Prober.cpp nsUniversalDetector.cpp uchardet.cpp diff --git a/src/nsCJKDetector.cpp b/src/nsCJKDetector.cpp new file mode 100644 index 0000000..7ee7f31 --- /dev/null +++ b/src/nsCJKDetector.cpp @@ -0,0 +1,239 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Universal charset detector code. This + * file was later added by Jehan in 2021 to add language detection. + * + * The Initial Developer of the Original Code is Netscape Communications + * Corporation. + * Portions created by the Initial Developer are Copyright (C) 2001 the + * Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Jehan <zemarmot.net> (2021) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +#define CJK_ENOUGH_CHAR_THRESHOLD 4096 +#define CJK_POSITIVE_SHORTCUT_THRESHOLD (float)0.95 +#define CJK_NEGATIVE_SHORTCUT_THRESHOLD (float)0.05 + +#include "nsCJKDetector.h" + +void nsCJKDetector::Reset(void) +{ + nsLanguageDetector::Reset(); + + mHangulChar = 0; + mKanaChar = 0; + mHanziChar = 0; +} + +nsDetectState nsCJKDetector::HandleData(const int* codePoints, PRUint32 cpLen) +{ + for (PRUint32 i = 0; i < cpLen; i++) + { + mTotalChar++; + + if ((codePoints[i] >= 0xAC00 && codePoints[i] <= 0xD7A3) || + (codePoints[i] >= 0x1100 && codePoints[i] <= 0x11FF) || + (codePoints[i] >= 0x3130 && codePoints[i] <= 0x318F) || + (codePoints[i] >= 0xA960 && codePoints[i] <= 0xA97F) || + (codePoints[i] >= 0xD7B0 && codePoints[i] <= 0xD7FF)) + { + mHangulChar++; + } + else if ((codePoints[i] >= 0x3041 && codePoints[i] <= 0x309F) || + (codePoints[i] >= 0x30A0 && codePoints[i] <= 0x30FF)) + { + mKanaChar++; + } + else if (codePoints[i] >= 0x4E00 && codePoints[i] <= 0x9FBF) + { + mHanziChar++; + } + else if (codePoints[i] <= 0x1F || codePoints[i] == 0x7F || /* C0 */ + (codePoints[i] <= 0x9F && codePoints[i] >= 0x80) || /* C1 */ + /* Separators: not strictly control characters for the Unicode + * standard, but we'll consider as such in our purpose. + */ + codePoints[i] == 0x2028 || codePoints[i] == 0x2029 || + /* Tags: U+E0001 is deprecated but other are still usable as + * emoji identifiers. Not sure how to use them. 
+ */ + codePoints[i] == 0xE0001 || + /* Interlinear annotations. */ + codePoints[i] == 0xFFF9 || codePoints[i] == 0xFFFA || + codePoints[i] == 0xFFFB || + /* Bidirectional text control. */ + codePoints[i] == 0x061C || codePoints[i] == 0x200E || + codePoints[i] == 0x200F || + (codePoints[i] >= 0x202A && codePoints[i] <= 0x202E) || + (codePoints[i] >= 0x2066 && codePoints[i] <= 0x2069) || + /* Control pictures. */ + (codePoints[i] >= 0x2400 && codePoints[i] <= 0x2426)) + { + /* XXX: some control characters such as variation selectors may + * need to be considered separately (basically just as if they + * were not here and simply skipped?). */ + //mCtrlChar++; + } + /* When encountering an illegal codepoint, no need + * to continue analyzing data. It means this is not right, hence + * that the encoding we deducted this codepoint from is wrong. + * Unfortunately listing all illegal codePoints in Unicode might be + * a daunting task and comparing each characters to all these + * illegal codePoints would be a lot of additional work. Is it + * really necessary? XXX + */ + else if (/* Tab, line feed and carriage returns are common enough + * that they should be considered as commonly used characters. 
+ */ + codePoints[i] == 0x9 || codePoints[i] == 0xA || codePoints[i] == 0xd || + (codePoints[i] >= 0x20 && codePoints[i] <= 0x40) || + (codePoints[i] >= 0x5B && codePoints[i] <= 0x5F) || + (codePoints[i] >= 0x7B && codePoints[i] <= 0x7E) || + (codePoints[i] >= 0xA0 && codePoints[i] <= 0xA5) || + (codePoints[i] >= 0xA0 && codePoints[i] <= 0xB4) || + (codePoints[i] >= 0xB6 && codePoints[i] <= 0xBF) || + codePoints[i] == 0xD7 || + codePoints[i] == 0xF7 || + /* General Punctuation */ + (codePoints[i] >= 0x2000 && codePoints[i] <= 0x206F) || + /* Vertical Forms */ + (codePoints[i] >= 0xFE10 && codePoints[i] <= 0xFE1F) || + /* CJK Symbols and Punctuation */ + (codePoints[i] >= 0x3000 && codePoints[i] <= 0x303F) || + /* Halfwidth and Fullwidth Forms */ + (codePoints[i] >= 0xFF00 && codePoints[i] <= 0xFFEF)) + { + /* Punctuations, various symbols, even numbers are simply + * ignored. + * As for halfwidth and fullwidth characters, I'm not sure what + * to do with them, but let's go with the same logics of + * skipping them, at least for now.. + */ + //mVariousBetween++; + } + else if (/* Common Ctrl except the ones considered as common chars. 
*/ + (codePoints[i] >= 0x1F600 && codePoints[i] <= 0x1F64F) || + codePoints[i] == 0xFE0E || codePoints[i] == 0xFE0F || + (codePoints[i] >= 0x1F3FB && codePoints[i] <= 0x1F3FF) || + /* Miscellaneous Symbols */ + (codePoints[i] >= 0x2600 && codePoints[i] <= 0x26FF) || + /* Supplemental Symbols and Pictographs */ + (codePoints[i] >= 0x1F90C && codePoints[i] <= 0x1F93A) || + (codePoints[i] >= 0x1F93C && codePoints[i] <= 0x1F945) || + (codePoints[i] >= 0x1F947 && codePoints[i] <= 0x1F978) || + (codePoints[i] >= 0x1F97A && codePoints[i] <= 0x1F9CB) || + (codePoints[i] >= 0x1F9CD && codePoints[i] <= 0x1F9FF) || + /* Miscellaneous Symbols and Pictographs */ + (codePoints[i] >= 0x1F300 && codePoints[i] <= 0x1F5FF) || + /* Transport and Map Symbols */ + (codePoints[i] >= 0x1F680 && codePoints[i] <= 0x1F6FF) || + /* Dingbat */ + (codePoints[i] >= 0x2700 && codePoints[i] <= 0x27BF)) + { + //mEmoticons++; + } + else + { + /* All the rest is to be considered as non-frequent characters. + * These are not disqualifying (we may also have a text with a bit + * of foreign quotes in it or very unusual characters sometimes) + * but they will drop a bit the confidence. 
+ */ + mOutChar++; + } + } + + if (mState == STATE_DETECTING) + if (mTotalChar > CJK_ENOUGH_CHAR_THRESHOLD) + { + ComputeConfidence(); + if (confidence > CJK_POSITIVE_SHORTCUT_THRESHOLD) + mState = STATE_FOUND; + else if (confidence < CJK_NEGATIVE_SHORTCUT_THRESHOLD) + mState = STATE_UNLIKELY; + } + + return mState; +} + +#include <cstdio> +float nsCJKDetector::GetConfidence(void) +{ + ComputeConfidence(); + + return confidence; +} + +const char* nsCJKDetector::GetLanguage() +{ + ComputeConfidence(); + + return language; +} + +void nsCJKDetector::ComputeConfidence(void) +{ + float confKo = 0.01f;; + float confJa = 0.01f;; + float confZh = 0.01f;; + float all_chars = (float) (mOutChar + mHanziChar + mHangulChar + mKanaChar); + float hangul_chars = (float) mHangulChar; + float hanzi_chars = (float) mHanziChar; + float kana_chars = (float) mKanaChar; + + language = NULL; + confidence = 0.01f; + + if (mTotalChar > 0) + { + confKo = hangul_chars / all_chars; + language = "ko"; + confidence = confKo; + + confZh = hanzi_chars / all_chars; + if (confZh > confKo) + { + language = "zh"; + confidence = confZh; + } + + /* Japanese still uses a lot of Chinese characters, so I think this + * very naive confidence computation will need to be revised soon. + * We should probably compute statistics of hanzi / (hanzi + kana) + * characters and use this as a weight modifier. 
+ */ + confJa = (kana_chars + hanzi_chars / 2.0) / all_chars; + if (confJa > confidence) + { + language = "ja"; + confidence = confJa; + } + } +} diff --git a/src/nsCJKDetector.h b/src/nsCJKDetector.h new file mode 100644 index 0000000..6490aea --- /dev/null +++ b/src/nsCJKDetector.h @@ -0,0 +1,70 @@ +/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is Mozilla Universal charset detector code. This + * file was later added by Jehan in 2021 to add language detection. + * + * The Initial Developer of the Original Code is Netscape Communications + * Corporation. + * Portions created by the Initial Developer are Copyright (C) 2001 the + * Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Jehan <zemarmot.net> (2021) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. 
If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ +#ifndef nsCJKDetector_h__ +#define nsCJKDetector_h__ + +#include "nsLanguageDetector.h" + +class nsCJKDetector: public nsLanguageDetector { +public: + nsCJKDetector(): nsLanguageDetector(nullptr) { Reset(); } + virtual ~nsCJKDetector() {} + + const char* GetLanguage(); + nsDetectState HandleData(const int* codepoints, PRUint32 cpLen); + float GetConfidence(void); + void Reset(void); + +protected: + /* Chinese characters (Kanji in Japanese) */ + PRUint32 mHanziChar; + /* Korean alphabet and syllabaries */ + PRUint32 mHangulChar; + /* Hiragana and Katakana (Japanese) */ + PRUint32 mKanaChar; + + const char* language; + float confidence; + +private: + + void ComputeConfidence(void); +}; + +#endif /* nsCJKDetector_h__ */ diff --git a/src/nsMBCSGroupProber.cpp b/src/nsMBCSGroupProber.cpp index 1b99da1..f19ec25 100644 --- a/src/nsMBCSGroupProber.cpp +++ b/src/nsMBCSGroupProber.cpp @@ -38,6 +38,7 @@ * ***** END LICENSE BLOCK ***** */ #include <stdio.h> +#include "nsCJKDetector.h" #include "nsMBCSGroupProber.h" #include "nsUniversalDetector.h" @@ -106,7 +107,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) langDetectors[i][j++] = new nsLanguageDetector(&HungarianModel); langDetectors[i][j++] = new nsLanguageDetector(&IrishModel); langDetectors[i][j++] = new nsLanguageDetector(&ItalianModel); - langDetectors[i][j++] = new nsLanguageDetector(&KoreanModel); + //langDetectors[i][j++] = new 
nsLanguageDetector(&KoreanModel); langDetectors[i][j++] = new nsLanguageDetector(&LatvianModel); langDetectors[i][j++] = new nsLanguageDetector(&LithuanianModel); langDetectors[i][j++] = new nsLanguageDetector(&MalteseModel); @@ -120,6 +121,7 @@ nsMBCSGroupProber::nsMBCSGroupProber(PRUint32 aLanguageFilter) langDetectors[i][j++] = new nsLanguageDetector(&ThaiModel); langDetectors[i][j++] = new nsLanguageDetector(&TurkishModel); langDetectors[i][j++] = new nsLanguageDetector(&VietnameseModel); + langDetectors[i][j++] = new nsCJKDetector(); } else { |