diff options
author | Jehan <jehan@girinstud.io> | 2020-04-23 16:15:54 +0200 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-14 00:23:13 +0100 |
commit | 15fc8f0a0f55d6ec9373cf16ed96ebf6b35feef3 (patch) | |
tree | 5ea7ccbab852f83fa04ef49a72f5d11848a5a6de | |
parent | 2f5c24006ebc7f005040358f58f22a61a3c92522 (diff) |
src: now reporting encoding+confidence and keeping a list.
Preparing for an updated API which will also allow callers to look at the
confidence value, as well as get the list of possible candidates (i.e.
all detected encodings which had a confidence value high enough that
we would even consider them).
It is still only internal logic though.
-rw-r--r-- | src/nsUniversalDetector.cpp | 23 | ||||
-rw-r--r-- | src/nsUniversalDetector.h | 3 | ||||
-rw-r--r-- | src/uchardet.cpp | 62 |
3 files changed, 62 insertions, 26 deletions
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 75474e0..2da4b4b 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -301,9 +301,12 @@ void nsUniversalDetector::DataEnd() if (mDetectedCharset) { - mDone = PR_TRUE; - Report(mDetectedCharset); - return; + /* These cases are limited enough that we are always confident + * when finding them. + */ + mDone = PR_TRUE; + Report(mDetectedCharset, 1.0); + return; } switch (mInputState) @@ -311,24 +314,18 @@ void nsUniversalDetector::DataEnd() case eHighbyte: { float proberConfidence; - float maxProberConfidence = (float)0.0; - PRInt32 maxProber = 0; for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++) { if (mCharSetProbers[i]) { proberConfidence = mCharSetProbers[i]->GetConfidence(); - if (proberConfidence > maxProberConfidence) - { - maxProberConfidence = proberConfidence; - maxProber = i; - } + + if (proberConfidence > MINIMUM_THRESHOLD) + /* Only report what we are confident in. */ + Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence); } } - //do not report anything because we are not confident of it, that's in fact a negative answer - if (maxProberConfidence > MINIMUM_THRESHOLD) - Report(mCharSetProbers[maxProber]->GetCharSetName()); } break; case eEscAscii: diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index 9f0a4b1..eecdea6 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -69,7 +69,8 @@ public: virtual void DataEnd(void); protected: - virtual void Report(const char* aCharset) = 0; + virtual void Report(const char* aCharset, + float confidence) = 0; virtual void Reset(); nsInputState mInputState; PRBool mNbspFound; diff --git a/src/uchardet.cpp b/src/uchardet.cpp index 46ee257..08eec17 100644 --- a/src/uchardet.cpp +++ b/src/uchardet.cpp @@ -37,45 +37,83 @@ #include "uchardet.h" #include <string.h> #include <stdlib.h> +#include <vector> #include "nscore.h" #include "nsUniversalDetector.h" +typedef struct 
_UChardetCandidate +{ + char *encoding; + char *language; + float confidence; +} UChardetCandidate; + class HandleUniversalDetector : public nsUniversalDetector { protected: - char *m_charset; + std::vector<UChardetCandidate> candidates; public: HandleUniversalDetector() : nsUniversalDetector(NS_FILTER_ALL) - , m_charset(0) { } virtual ~HandleUniversalDetector() { - if (m_charset) - free(m_charset); + Reset(); } - virtual void Report(const char* charset) + virtual void Report(const char *encoding, + float confidence) { - if (m_charset) - free(m_charset); - m_charset = strdup(charset); + std::vector<UChardetCandidate>::iterator it; + UChardetCandidate candidate; + + for (it = candidates.begin(); it != candidates.end(); it++) + { + if (strcmp(it->encoding, encoding) == 0) + { + /* Already reported. Bail out or update the confidence + * when needed. + */ + if (confidence > it->confidence) + { + candidates.erase(it); + break; + } + else + { + return; + } + } + } + + candidate = UChardetCandidate(); + candidate.encoding = strdup(encoding); + candidate.confidence = confidence; + + for (it = candidates.begin(); it != candidates.end(); it++) + { + if (it->confidence < confidence) + break; + } + candidates.insert(it, candidate); } virtual void Reset() { + std::vector<UChardetCandidate>::iterator it; + nsUniversalDetector::Reset(); - if (m_charset) - free(m_charset); - m_charset = strdup(""); + for (it = candidates.begin(); it != candidates.end(); it++) + free(it->encoding); + candidates.clear(); } const char* GetCharset() const { - return m_charset? m_charset : ""; + return (candidates.size() > 0) ? candidates[0].encoding : ""; } }; |