src: new weight concept in the C API.

Pretty basic: you can weight preferred languages and this will impact the result. Say the algorithm "hesitates" between encoding E1 in language L1 and encoding E2 in language L2. By giving L2 a weight of 1.1, for instance because it is the OS language or the user's usual preferred language, you may help the algorithm overcome very tight cases. It can also be helpful when you already know for sure the language of a document but just don't know its encoding. Then you may set a very high value for this language, or simply set a default value of 0 and set 1 for this language. Only relevant encodings will then be taken into account. This is still limited though, as generic encodings are still implemented in a language-agnostic way. UTF-8, for instance, would be disadvantaged by this weight system until we make it language-aware.
author: Jehan <jehan@girinstud.io> 2020-04-27 18:07:32 +0200
committer: Jehan <jehan@girinstud.io> 2021-03-14 00:12:30 +0100
commit: 7f99b913882b16ab957189f67d802ecf02375f38 (patch)
tree: f198af1b5afded51d96d90f8259397bcae429306
parent: f15d097f29a753cb2f606bc460f0048f241a64d7 (diff)
3 files changed, 86 insertions, 4 deletions
diff --git a/src/symbols.cmake b/src/symbols.cmake
index e66bfa0..e5d2cc7 100644
--- a/src/symbols.cmake
+++ b/src/symbols.cmake
@@ -10,6 +10,8 @@ set(
     uchardet_get_encoding
     uchardet_get_confidence
     uchardet_get_language
+    uchardet_set_default_weight
+    uchardet_weigh_language
 )
 
 set (LINK_FLAGS "")
diff --git a/src/uchardet.cpp b/src/uchardet.cpp
index 19a73f0..b2207e3 100644
--- a/src/uchardet.cpp
+++ b/src/uchardet.cpp
@@ -37,6 +37,8 @@
 #include "uchardet.h"
 #include <string.h>
 #include <stdlib.h>
+#include <map>
+#include <string>
 #include <vector>
 #include "nscore.h"
 #include "nsUniversalDetector.h"
@@ -52,10 +54,13 @@ class HandleUniversalDetector : public nsUniversalDetector
 {
 protected:
     std::vector<UChardetCandidate> candidates;
+    std::vector<UChardetCandidate> weighed_candidates;
+    std::map<std::string, float> weights;
+    float default_weight;
 
 public:
     HandleUniversalDetector()
-    : nsUniversalDetector(NS_FILTER_ALL)
+    : nsUniversalDetector(NS_FILTER_ALL), default_weight(1.0)
     {
     }
 
@@ -102,6 +107,9 @@ public:
                 break;
         }
         candidates.insert(it, candidate);
+
+        if (weights.size() > 0)
+            WeighCandidates();
     }
 
     virtual void Reset()
@@ -123,21 +131,75 @@ public:
         return candidates.size();
     }
 
-    const char* GetCharset(size_t i) const
+    const char* GetCharset(size_t i)
     {
+        if (weights.size() > 0)
+            return (weighed_candidates.size() > i) ? weighed_candidates[i].encoding : "";
         return (candidates.size() > i) ? candidates[i].encoding : "";
     }
 
-    float GetConfidence(size_t i) const
+    float GetConfidence(size_t i)
     {
+        if (weights.size() > 0)
+            return (weighed_candidates.size() > i) ? weighed_candidates[i].confidence : 0.0;
         return (candidates.size() > i) ? candidates[i].confidence : 0.0;
     }
 
-    const char* GetLanguage(size_t i) const
+    const char* GetLanguage(size_t i)
     {
+        if (weights.size() > 0)
+            return (weighed_candidates.size() > i) ? weighed_candidates[i].language : NULL;
         return (candidates.size() > i) ? candidates[i].language : NULL;
     }
 
+    void WeighLanguage(const char *language,
+                       float       weight)
+    {
+        weights[language] = weight;
+        WeighCandidates();
+    }
+
+    void WeighDefault(float weight)
+    {
+        default_weight = weight;
+        WeighCandidates();
+    }
+
+private:
+
+    void WeighCandidates()
+    {
+        std::vector<UChardetCandidate>::iterator it;
+        std::vector<UChardetCandidate>::iterator it2;
+        UChardetCandidate                        candidate;
+
+        weighed_candidates.clear();
+        for (it = candidates.begin(); it != candidates.end(); it++)
+        {
+            std::map<std::string, float>::iterator weight_it;
+            float                                  confidence;
+
+            confidence = it->confidence * default_weight;
+            if (it->language)
+            {
+                weight_it = weights.find(it->language);
+                if (weight_it != weights.end())
+                    confidence = weight_it->second * it->confidence;
+            }
+
+            candidate = UChardetCandidate();
+            candidate.encoding   = it->encoding;
+            candidate.language   = it->language;
+            candidate.confidence = confidence;
+
+            for (it2 = weighed_candidates.begin(); it2 != weighed_candidates.end(); it2++)
+            {
+                if (it2->confidence < confidence)
+                    break;
+            }
+            weighed_candidates.insert(it2, candidate);
+        }
+    }
 };
 
 uchardet_t uchardet_new(void)
@@ -197,3 +259,16 @@ const char * uchardet_get_language (uchardet_t ud,
 {
     return reinterpret_cast<HandleUniversalDetector*>(ud)->GetLanguage(candidate);
 }
+
+void uchardet_weigh_language (uchardet_t  ud,
+                              const char *language,
+                              float       weight)
+{
+    reinterpret_cast<HandleUniversalDetector*>(ud)->WeighLanguage(language, weight);
+}
+
+void uchardet_set_default_weight (uchardet_t  ud,
+                                  float       weight)
+{
+    reinterpret_cast<HandleUniversalDetector*>(ud)->WeighDefault(weight);
+}
diff --git a/src/uchardet.h b/src/uchardet.h
index df1387e..d2299b6 100644
--- a/src/uchardet.h
+++ b/src/uchardet.h
@@ -123,6 +123,11 @@ UCHARDET_INTERFACE const char * uchardet_get_encoding   (uchardet_t ud,
 UCHARDET_INTERFACE const char * uchardet_get_language   (uchardet_t ud,
                                                          size_t     candidate);
 
+UCHARDET_INTERFACE void         uchardet_weigh_language (uchardet_t  ud,
+                                                         const char *language,
+                                                         float       weight);
+UCHARDET_INTERFACE void         uchardet_set_default_weight (uchardet_t  ud,
+                                                             float       weight);
 
 #ifdef __cplusplus
 }
author	Jehan <jehan@girinstud.io>	2020-04-27 18:07:32 +0200
committer	Jehan <jehan@girinstud.io>	2021-03-14 00:12:30 +0100
commit	7f99b913882b16ab957189f67d802ecf02375f38 (patch)
tree	f198af1b5afded51d96d90f8259397bcae429306
parent	f15d097f29a753cb2f606bc460f0048f241a64d7 (diff)