summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2020-04-23 16:15:54 +0200
committerJehan <jehan@girinstud.io>2020-04-23 16:15:54 +0200
commit4b7b0476fb8d049f4229b6a8b49d4d474a865690 (patch)
tree99bd7525e0d3fd3de01917416c838212c950a9dd
parenta49f8ef6ea579a9665a7bd90207ef35b654bf9d3 (diff)
src: now reporting encoding+confidence and keeping a list.
Preparing for an updated API which will also allow callers to look at the confidence value, as well as get the list of possible candidates (i.e. all detected encodings which had a confidence value high enough that we would even consider them). It is still only internal logic though.
-rw-r--r--src/nsUniversalDetector.cpp23
-rw-r--r--src/nsUniversalDetector.h3
-rw-r--r--src/uchardet.cpp62
3 files changed, 62 insertions, 26 deletions
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 75474e0..2da4b4b 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -301,9 +301,12 @@ void nsUniversalDetector::DataEnd()
if (mDetectedCharset)
{
- mDone = PR_TRUE;
- Report(mDetectedCharset);
- return;
+ /* These cases are limited enough that we are always confident
+ * when finding them.
+ */
+ mDone = PR_TRUE;
+ Report(mDetectedCharset, 1.0);
+ return;
}
switch (mInputState)
@@ -311,24 +314,18 @@ void nsUniversalDetector::DataEnd()
case eHighbyte:
{
float proberConfidence;
- float maxProberConfidence = (float)0.0;
- PRInt32 maxProber = 0;
for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
{
if (mCharSetProbers[i])
{
proberConfidence = mCharSetProbers[i]->GetConfidence();
- if (proberConfidence > maxProberConfidence)
- {
- maxProberConfidence = proberConfidence;
- maxProber = i;
- }
+
+ if (proberConfidence > MINIMUM_THRESHOLD)
+ /* Only report what we are confident in. */
+ Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence);
}
}
- //do not report anything because we are not confident of it, that's in fact a negative answer
- if (maxProberConfidence > MINIMUM_THRESHOLD)
- Report(mCharSetProbers[maxProber]->GetCharSetName());
}
break;
case eEscAscii:
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
index 9f0a4b1..eecdea6 100644
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@@ -69,7 +69,8 @@ public:
virtual void DataEnd(void);
protected:
- virtual void Report(const char* aCharset) = 0;
+ virtual void Report(const char* aCharset,
+ float confidence) = 0;
virtual void Reset();
nsInputState mInputState;
PRBool mNbspFound;
diff --git a/src/uchardet.cpp b/src/uchardet.cpp
index 46ee257..08eec17 100644
--- a/src/uchardet.cpp
+++ b/src/uchardet.cpp
@@ -37,45 +37,83 @@
#include "uchardet.h"
#include <string.h>
#include <stdlib.h>
+#include <vector>
#include "nscore.h"
#include "nsUniversalDetector.h"
+typedef struct _UChardetCandidate
+{
+ char *encoding;
+ char *language;
+ float confidence;
+} UChardetCandidate;
+
class HandleUniversalDetector : public nsUniversalDetector
{
protected:
- char *m_charset;
+ std::vector<UChardetCandidate> candidates;
public:
HandleUniversalDetector()
: nsUniversalDetector(NS_FILTER_ALL)
- , m_charset(0)
{
}
virtual ~HandleUniversalDetector()
{
- if (m_charset)
- free(m_charset);
+ Reset();
}
- virtual void Report(const char* charset)
+ virtual void Report(const char *encoding,
+ float confidence)
{
- if (m_charset)
- free(m_charset);
- m_charset = strdup(charset);
+ std::vector<UChardetCandidate>::iterator it;
+ UChardetCandidate candidate;
+
+ for (it = candidates.begin(); it != candidates.end(); it++)
+ {
+ if (strcmp(it->encoding, encoding) == 0)
+ {
+ /* Already reported. Bail out or update the confidence
+ * when needed.
+ */
+ if (confidence > it->confidence)
+ {
+ candidates.erase(it);
+ break;
+ }
+ else
+ {
+ return;
+ }
+ }
+ }
+
+ candidate = UChardetCandidate();
+ candidate.encoding = strdup(encoding);
+ candidate.confidence = confidence;
+
+ for (it = candidates.begin(); it != candidates.end(); it++)
+ {
+ if (it->confidence < confidence)
+ break;
+ }
+ candidates.insert(it, candidate);
}
virtual void Reset()
{
+ std::vector<UChardetCandidate>::iterator it;
+
nsUniversalDetector::Reset();
- if (m_charset)
- free(m_charset);
- m_charset = strdup("");
+ for (it = candidates.begin(); it != candidates.end(); it++)
+ free(it->encoding);
+ candidates.clear();
}
const char* GetCharset() const
{
- return m_charset? m_charset : "";
+ return (candidates.size() > 0) ? candidates[0].encoding : "";
}
};