src: now reporting encoding+confidence and keeping a list.

Preparing for an updated API which will also allow callers to look at the confidence value, as well as to get the list of possible candidates (i.e. all detected encodings which had a confidence value high enough for us to even consider them). It is still only internal logic though.
author: Jehan <jehan@girinstud.io> 2020-04-23 16:15:54 +0200
committer: Jehan <jehan@girinstud.io> 2022-12-14 00:23:13 +0100
commit: 15fc8f0a0f55d6ec9373cf16ed96ebf6b35feef3 (patch)
tree: 5ea7ccbab852f83fa04ef49a72f5d11848a5a6de
parent: 2f5c24006ebc7f005040358f58f22a61a3c92522 (diff)
3 files changed, 62 insertions, 26 deletions
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 75474e0..2da4b4b 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -301,9 +301,12 @@ void nsUniversalDetector::DataEnd()
 
   if (mDetectedCharset)
   {
-    mDone = PR_TRUE;
-    Report(mDetectedCharset);
-    return;
+      /* These cases are limited enough that we are always confident
+       * when finding them.
+       */
+      mDone = PR_TRUE;
+      Report(mDetectedCharset, 1.0);
+      return;
   }
 
   switch (mInputState)
@@ -311,24 +314,18 @@ void nsUniversalDetector::DataEnd()
   case eHighbyte:
     {
       float proberConfidence;
-      float maxProberConfidence = (float)0.0;
-      PRInt32 maxProber = 0;
 
       for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
       {
         if (mCharSetProbers[i])
         {
           proberConfidence = mCharSetProbers[i]->GetConfidence();
-          if (proberConfidence > maxProberConfidence)
-          {
-            maxProberConfidence = proberConfidence;
-            maxProber = i;
-          }
+
+          if (proberConfidence > MINIMUM_THRESHOLD)
+              /* Only report what we are confident in. */
+              Report(mCharSetProbers[i]->GetCharSetName(), proberConfidence);
         }
       }
-      //do not report anything because we are not confident of it, that's in fact a negative answer
-      if (maxProberConfidence > MINIMUM_THRESHOLD)
-        Report(mCharSetProbers[maxProber]->GetCharSetName());
     }
     break;
   case eEscAscii:
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
index 9f0a4b1..eecdea6 100644
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@@ -69,7 +69,8 @@ public:
    virtual void DataEnd(void);
 
 protected:
-   virtual void Report(const char* aCharset) = 0;
+   virtual void Report(const char* aCharset,
+                       float       confidence) = 0;
    virtual void Reset();
    nsInputState  mInputState;
    PRBool  mNbspFound;
diff --git a/src/uchardet.cpp b/src/uchardet.cpp
index 46ee257..08eec17 100644
--- a/src/uchardet.cpp
+++ b/src/uchardet.cpp
@@ -37,45 +37,83 @@
 #include "uchardet.h"
 #include <string.h>
 #include <stdlib.h>
+#include <vector>
 #include "nscore.h"
 #include "nsUniversalDetector.h"
 
+typedef struct _UChardetCandidate
+{
+    char  *encoding;
+    char  *language;
+    float  confidence;
+} UChardetCandidate;
+
 class HandleUniversalDetector : public nsUniversalDetector
 {
 protected:
-    char *m_charset;
+    std::vector<UChardetCandidate> candidates;
 
 public:
     HandleUniversalDetector()
     : nsUniversalDetector(NS_FILTER_ALL)
-    , m_charset(0)
     {
     }
 
     virtual ~HandleUniversalDetector()
     {
-        if (m_charset)
-            free(m_charset);
+        Reset();
     }
 
-    virtual void Report(const char* charset)
+    virtual void Report(const char *encoding,
+                        float       confidence)
     {
-        if (m_charset)
-            free(m_charset);
-        m_charset = strdup(charset);
+        std::vector<UChardetCandidate>::iterator it;
+        UChardetCandidate                        candidate;
+
+        for (it = candidates.begin(); it != candidates.end(); it++)
+        {
+            if (strcmp(it->encoding, encoding) == 0)
+            {
+                /* Already reported. Bail out or update the confidence
+                 * when needed.
+                 */
+                if (confidence > it->confidence)
+                {
+                    candidates.erase(it);
+                    break;
+                }
+                else
+                {
+                    return;
+                }
+            }
+        }
+
+        candidate = UChardetCandidate();
+        candidate.encoding   = strdup(encoding);
+        candidate.confidence = confidence;
+
+        for (it = candidates.begin(); it != candidates.end(); it++)
+        {
+            if (it->confidence < confidence)
+                break;
+        }
+        candidates.insert(it, candidate);
     }
 
     virtual void Reset()
     {
+        std::vector<UChardetCandidate>::iterator it;
+
         nsUniversalDetector::Reset();
-        if (m_charset)
-            free(m_charset);
-        m_charset = strdup("");
+        for (it = candidates.begin(); it != candidates.end(); it++)
+            free(it->encoding);
+        candidates.clear();
     }
 
     const char* GetCharset() const
     {
-        return m_charset? m_charset : "";
+        return (candidates.size() > 0) ? candidates[0].encoding : "";
     }
 };
author	Jehan <jehan@girinstud.io>	2020-04-23 16:15:54 +0200
committer	Jehan <jehan@girinstud.io>	2022-12-14 00:23:13 +0100
commit	15fc8f0a0f55d6ec9373cf16ed96ebf6b35feef3 (patch)
tree	5ea7ccbab852f83fa04ef49a72f5d11848a5a6de
parent	2f5c24006ebc7f005040358f58f22a61a3c92522 (diff)