summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2022-12-18 22:28:54 +0100
committerJehan <jehan@girinstud.io>2022-12-18 22:33:12 +0100
commit0fe51d3851711efa771ebf5d7662849a57e3b208 (patch)
tree58aceb6c4afb659c938e0f41abc39f8297a00de4
parenta82139b3bd4fea54c94edd9c2632fed169472f70 (diff)
Issue #21: Greek CP737 support.
It actually breaks "zh:big5", so I'm going to hold off a bit. Adding more language and charset support is slowly starting to show the limitations of our legacy multi-byte charset support, as I haven't really touched these parts since Mozilla's original implementation. It might be time to start reviewing these parts of the code. The test file contents come from the 'Μαρμότα' page on the Greek Wikipedia (though since 2 letters are missing in this encoding, despite its popularity for Greek, I had to be careful to choose pieces of text without such letters).
-rw-r--r--README.md1
-rw-r--r--script/BuildLangModelLogs/LangGreekModel.log436
-rw-r--r--script/charsets/cp737.py79
-rw-r--r--script/langs/el.py4
-rw-r--r--src/LangModels/LangGreekModel.cpp344
-rw-r--r--src/nsSBCSGroupProber.cpp1
-rw-r--r--src/nsSBCharSetProber-generated.h3
-rw-r--r--test/el/cp737.txt1
8 files changed, 489 insertions, 380 deletions
diff --git a/README.md b/README.md
index 0b85469..ab200bb 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,7 @@ uchardet started as a C language binding of the original C++ implementation of t
* UTF-8
* ISO-8859-7
* WINDOWS-1253
+ * CP737
* Hebrew
* UTF-8
* ISO-8859-8
diff --git a/script/BuildLangModelLogs/LangGreekModel.log b/script/BuildLangModelLogs/LangGreekModel.log
index f81f77f..6a8a092 100644
--- a/script/BuildLangModelLogs/LangGreekModel.log
+++ b/script/BuildLangModelLogs/LangGreekModel.log
@@ -1,247 +1,231 @@
= Logs of language model for Greek (el) =
- Generated by BuildLangModel.py
-- Started: 2022-12-14 23:56:52.996274
+- Started: 2022-12-18 20:25:01.002309
- Maximum depth: 4
- Max number of pages: 200
== Parsed pages ==
-Πύλη:Κύρια (revision 9720674)
-Θάνατος της Μάχσα Αμίνι (revision 9785479)
-Ιράκ (revision 9784253)
-5ος αιώνας π.Χ. (revision 9503435)
-1960 (revision 9026602)
-4ος αιώνας π.Χ. (revision 9500049)
-Σάντα Μαρία ντελ Πόπολο (revision 9813223)
-Ρίσι Σούνακ (revision 9807035)
-Γαλλία (revision 9809487)
-Γενικός Γραμματέας του Κομμουνιστικού Κόμματος της Κίνας (revision 9790632)
-Ολυμπιακοί Αγώνες (revision 9767748)
-Αμπέμπε Μπικίλα (revision 9629134)
-Κεντρική Αμερική (revision 9436648)
-Ζήνων ο Ελεάτης (revision 9687689)
-25 Απριλίου (revision 9528548)
-Ιαπωνία (revision 9738909)
-Η Σταύρωση του Αγίου Πέτρου (revision 8027915)
-Δόλιχος (revision 9642281)
-Κατάρ (revision 9777831)
-Καραϊβική (revision 9776894)
-Πρωτεύουσα της Γαλλίας (revision 9694896)
-16 Οκτωβρίου (revision 9719896)
-Επισκοπή της Ρώμης (revision 8247187)
-8ος αιώνας π.Χ. (revision 9509177)
-1516 (revision 8891470)
-Συριακή Ορθόδοξη Εκκλησία (revision 8814522)
-Θεόφραστος (revision 9645199)
-16 Σεπτεμβρίου (revision 9054145)
-Ατλαντικός Ωκεανός (revision 9450595)
-Ηνωμένο Βασίλειο (revision 9738607)
-Καναδάς (revision 9736902)
-Νότια Ασία (revision 9718470)
-Άρθουρ Γουέλσλι, δούκας του Ουέλλινγκτον (revision 9810101)
-Βιετνάμ (revision 9767839)
-Μεροβίγγειοι (revision 9720122)
-Ντομινίκ Γουίλκινς (revision 9798740)
-BIBSYS (revision 9155553)
-Γουατεμάλα (revision 9770327)
-Αγγλική γλώσσα (revision 9779698)
-Αρκτικός Ωκεανός (revision 9450607)
-Αθλητισμός (revision 9816520)
-Δυτική Ευρώπη (revision 9667409)
-Αγγλία (revision 9730532)
-Λεωτυχίδας ο Λακεδαιμόνιος (revision 9655599)
-Καινή Διαθήκη (revision 9725581)
-1660 (revision 7905687)
-Ευρασία-Αφρική (revision 9667364)
-Άμστερνταμ (revision 9701304)
-Πολυνησία (revision 9667374)
-Άρσης (revision 8381416)
-Τασκένδη (revision 9674027)
-Système universitaire de documentation (revision 9519040)
-1964 (revision 9811809)
-COVID-19 (revision 9751162)
-Σεισμός (revision 9555986)
-Μαρόκο (revision 9816451)
-Ευρωπαϊκή Ένωση (revision 9807037)
-Αρχαιοελληνική πυγμαχία (revision 9008193)
-Άντονι Ήντεν (revision 9752041)
-11 Αυγούστου (revision 8988727)
-2006 (revision 9797947)
+Πρωτεύοντα (revision 9792164)
+Ευαρχοντομυωξοί (revision 9475530)
+Φολιδωτά (θηλαστικά) (revision 8966182)
+Ανθρώπινη εξέλιξη (revision 9731824)
+Υδατάνθρακες (revision 9276169)
+Άνθρωπος (revision 9804050)
+National Library of the Czech Republic (revision 9499518)
+Ταξινομία (revision 6174527)
+Δεοξυριβόζη (revision 9735675)
+Συστηματική ταξινόμηση (revision 9163863)
+Οικογένεια (βιολογία) (revision 8380547)
+Μονοσακχαρίτης (revision 8520367)
+Ευλιπότυφλα (revision 8635098)
+Γαλάγος (revision 9624211)
+Ανθρωποειδή (revision 9802784)
+Μυρμήγκι (revision 9743672)
+Primates (revision 9792164)
+Εθνική Βιβλιοθήκη της Μποτσουάνα (revision 9771961)
+Εθνική Βιβλιοθήκη της Σλοβακίας (revision 9545464)
+Κίνα (revision 9794230)
+Μονοσακχαρίτες (revision 8520367)
+Άνθρακας (revision 9698608)
+Τερμίτης (revision 8570600)
+Virtual International Authority File (revision 9547787)
+Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547)
+International Union for Conservation of Nature (revision 9555075)
+Neogene (revision 7970278)
+Ανθρωπoειδή (revision 9802784)
+Λάρυγγας (revision 8037233)
+Θηλαστικά (revision 9802762)
+IUCN Red List (revision 9104016)
+Δισακχαρίτης (revision 9301054)
+Ινσουλίνη (revision 9193560)
+Αρχαϊκοί Homo sapiens (revision 9496339)
+Εθνική Βιβλιοθήκη της Μοζαμβίκης (revision 9771960)
+Εθνική Βιβλιοθήκη της Πολωνίας (revision 9771967)
+Ολιγοσακχαρίτης (revision 9784937)
+Θεσμός (revision 9409922)
+Μοριακό βάρος (revision 8588261)
+Παράνθρωποι (revision 9187211)
+Χρονολόγιο της ανθρώπινης εξέλιξης (revision 9494488)
+Κοινός πρόγονος (revision 7955205)
+Ασία (revision 9640488)
+Εθνική Βιβλιοθήκη του Βανουάτου (revision 9510031)
+Συνομοταξία (revision 8090691)
+Διαδίκτυο (revision 9818610)
+Τριγλυκερίδιο (revision 8991916)
+Εθνική Βιβλιοθήκη της Λετονίας (revision 9736743)
+Εθνική βιβλιοθήκη της Σουηδίας (revision 9741133)
+Ζώα (revision 9797988)
+Απειλούμενα είδη (revision 9387012)
+Εθνική Βιβλιοθήκη της Μιανμάρ (revision 9771959)
+Silurian (revision 7083264)
+Γερμανική γλώσσα (revision 9768836)
+Ζωολογία (revision 9597532)
+Σπονδυλωτά (revision 8936763)
+Χορδωτά (revision 9800855)
+Εθνική Βιβλιοθήκη του Ελ Σαλβαδόρ (revision 9608126)
+Μακρομόρια (revision 8962637)
+Homo sapiens (revision 9804050)
+Γλυκίδια (revision 8976376)
Κατάλογος καθιερωμένων όρων (revision 9747802)
-Ρωσία (revision 9756811)
-Μπιτς βόλεϊ (revision 9629816)
-Αραβική γλώσσα (revision 9758388)
-7ος αιώνας π.Χ. (revision 9509175)
+Paleogene (revision 7772183)
+Γένος (βιολογία) (revision 8620951)
+Βραδυποδόμορφα (revision 8793874)
+Εθνική Βιβλιοθήκη της Ουκρανίας (revision 9818749)
+Περίοδος (γεωλογία) (revision 9598229)
+Γραμμομόριο (revision 9175982)
+Νουκλεϊκά οξέα (revision 9020237)
+Γάλα (revision 9473543)
+Μετάλλαξη (revision 9662655)
+Γαλακτόζη (revision 8983758)
+Φάλαινα (revision 9455804)
+Εθνική Βιβλιοθήκη της Ισπανίας (revision 9771953)
+Γλυκογόνο (revision 8033277)
+Ισπανική γλώσσα (revision 9751022)
+Φωνητικές χορδές (revision 9179304)
+Κανονικές συνθήκες (revision 9776846)
+Άλπεις (revision 9759633)
+Αντίδραση συμπύκνωσης (revision 8965637)
+National Diet Library (revision 9533181)
+Εθνική Βιβλιοθήκη της Βραζιλίας (revision 9516238)
+Homo sapiens sapiens (revision 9804050)
+Εθνική Βιβλιοθήκη των Μπαρμπάντος (revision 9608141)
+Μουντάνεουμ (revision 9387431)
+Ζώο (revision 9797988)
+Εθνική και Πανεπιστημιακή Βιβλιοθήκη της Ισλανδίας (revision 9510045)
+Μόριο (revision 9737689)
+Εθνική Βιβλιοθήκη της Ανδόρας (revision 9771949)
+Βασίλειο (βιολογία) (revision 9171746)
+Εθνική και Πανεπιστημιακή Βιβλιοθήκη «Άγιος Κλήμης της Αχρίδας» (revision 9608210)
+Κλίμα (revision 9262599)
+Δακτυλιοσκώληκες (revision 8985128)
+Ασπάλακας (revision 9429446)
+Μόλυνση (revision 8512424)
International Standard Name Identifier (revision 6861942)
-Ελευθερία, Ισότητα, Αδελφοσύνη (revision 8591437)
-Συμβούλιο των Αντιπροσώπων του Ιράκ (revision 9812554)
-Βιοεπιστήμες (revision 9175912)
-21 Οκτωβρίου (revision 9123772)
-Καμήλα (revision 9815157)
-1526 (revision 7905955)
-Μικτή οικονομία (revision 9756694)
-Εμπραχίμ Ραΐσι (revision 9662244)
-Βαθυσκάφος (revision 8755412)
-Γιοχάνεσμπουργκ (revision 9234192)
-Γιαζίντι (revision 9251594)
-Πακιστάν (revision 9719399)
-27 Νοεμβρίου (revision 9168386)
-25 Οκτωβρίου (revision 9732587)
-Ειρηνικός Ωκεανός (revision 9659005)
-Βία κατά των γυναικών (revision 9404071)
-Γιανίκ Νοά (revision 9721039)
-Μεξικό (revision 9672139)
-Ευρώπη (revision 9806540)
-Ακκάδιοι (revision 8546428)
-Είλωτες (revision 9472621)
-Λεύκιππος (φιλόσοφος) (revision 8933581)
-Δόλοπες (revision 9091900)
-Ήλιος (revision 9797813)
-Αντίς Αμπέμπα (revision 9703571)
-4 Απριλίου (revision 9797052)
-Μεγασθένης (revision 9397713)
-Ισλαμική Συμβουλευτική Συνέλευση (revision 6895099)
-Αμχαρική γλώσσα (revision 8252762)
-Προφήτης Ιωνάς (revision 8981060)
-Μεσοποταμία (revision 9478563)
-Deutsche Welle (revision 9697594)
-Θεοδόσιος Α' (revision 9717330)
-Τανζανία (revision 9672789)
-Μπαρόκ (revision 9498929)
-Διαδηλώσεις στο Ιράν για το θάνατο της Μάχσα Αμίνι (revision 9746434)
-Ρώμη (revision 9684590)
-Εκκλησία (αρχιτεκτονική) (revision 9466423)
-Integrated Authority File (revision 8518544)
-Μπραζίλια (revision 9696231)
-2η χιλιετία (revision 9650679)
-Ιράν (revision 9804479)
-Ολυμπιονίκης (revision 9767748)
-The Guardian (revision 9533576)
-Ανεξαρτησία (revision 9730220)
-Σεισμός στο Αγκαντίρ (1960) (revision 9042146)
-Γκέμπχαρντ Λέμπερεχτ φον Μπλύχερ (revision 9711616)
-Αντρές Μανουέλ Λόπες Ομπραδόρ (revision 9591461)
-Ηνωμένα Έθνη (revision 9596090)
-Καρλομάγνος (revision 9776930)
-Διάδης ο Πελλαίος (revision 8453992)
-Ισλάμ (revision 9535017)
-International Standard Serial Number (revision 9426410)
-Πολ Βερλέν (revision 9620098)
-Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547)
-Παντζάμποι (revision 9410265)
-Θερινοί Ολυμπιακοί Αγώνες 2004 (revision 9646436)
-Κεντρική Αφρική (revision 9666820)
-1695 (revision 7905645)
-1841 (revision 9476734)
-Καράτζ (revision 7678423)
-Αρχιτεκτονική (revision 9699724)
-Κάλι Γιούγκα (revision 9173397)
-Νέπιντο (revision 9786205)
-Ησαΐας (revision 9285628)
-Μιγκέλ Ιδάλγο ι Κοστίγια (revision 6788184)
-Λατινικά (revision 9751029)
-Ριγιάλ του Κατάρ (revision 9194726)
-Καπιταλισμός (revision 9353276)
-Σύστημα ταξινόμησης βιβλιοθήκης (revision 9648804)
-Εκαταίος ο Μιλήσιος (revision 9332195)
-Χαντίθ (revision 9423173)
-24 Οκτωβρίου (revision 9123843)
-Ατρείδες (revision 9269517)
-Νερό (revision 9799579)
-Χριστιανισμός στην Συρία (revision 9504011)
+Υδροξυλομάδα (revision 9719647)
+Εθνική Βιβλιοθήκη του Κουβέιτ (revision 9511761)
+Homo rhodesiensis (revision 7605622)
+Αγγλική γλώσσα (revision 9779698)
+Περιβαλλοντική εκπαίδευση (revision 7971138)
+Γουανίνη (revision 8392293)
+Γριβάδι (revision 9370003)
+Διεθνής Επιτροπή Στρωματογραφίας (revision 9796210)
+Εχινόδερμα (revision 9101031)
+Εθνική Βιβλιοθήκη των Φιλιππινών (revision 9511751)
+Αρτίγονος (revision 9753577)
+Εθνική Βιβλιοθήκη της Σρι Λάνκα (revision 9511705)
+Περιβαλλοντικά προβλήματα (revision 9555971)
+Υπερτάξη (revision 7554395)
+Κατάλογος αντιστοιχίας Λατινικών-Ελληνικών όρων ταξινομικών μονάδων (revision 9562399)
+Κόκκινος κατάλογος της IUCN (revision 9104016)
+Κοινοβουλευτική Βιβλιοθήκη της Γεωργίας (revision 9508234)
+Ασπόνδυλα (revision 9049085)
+Τάξη (βιολογία) (revision 7554395)
+Γρυλοβλαττοειδή (revision 6401187)
+Γλυκόζη (revision 9770284)
+Τουρκικές γλώσσες (revision 9284882)
+Εκπνοή (revision 9611418)
+Ανθρωπίνοι (revision 9103976)
+Εθνική Βιβλιοθήκη του Μαυρικίου (revision 9736776)
+Σαρκοφάγα (revision 8222140)
+Χημική ένωση (revision 9478321)
+Νουκλεοτίδια (revision 8520133)
+Πλειστόκαινο (revision 9225169)
+Υποοικογένεια (revision 8380547)
+Πόδι (έντομα) (revision 7865328)
+Δημόσια Βιβλιοθήκη Τσαρλς Α. Χάλμπερτ (revision 9607718)
+Δισακχαρίτες (revision 9301054)
+Νορβηγική γλώσσα (revision 9527903)
+Σορβόζη (revision 9702780)
+Bibliothèque nationale de France (revision 9636186)
+1778 (revision 9509259)
+Αμυλοπηκτίνη (revision 7348804)
+Υφομοταξία (revision 9796614)
+Κοινή καταγωγή (revision 7955205)
+Βιβλιοθήκη του Βατικανού (revision 9791596)
+Κράμα (revision 8491814)
+Orrorin tugenensis (revision 8021796)
+Εθνική Βιβλιοθήκη της Γερμανίας (revision 9533197)
Εθνική Βιβλιοθήκη της Ελλάδος (revision 9771951)
-Οθωμανικός Στρατός (revision 8724392)
-Κάτεγατ (revision 9501508)
-Βερμούδες (revision 9767755)
-Floruit (revision 9328997)
-Τρανσυλβανία (revision 9743855)
-Κρυπτεία (revision 9689259)
-Δημοκρατία της Ιρλανδίας (revision 9672626)
-Κούρδοι (revision 9814315)
-Οθέλλος (revision 9168875)
-Κώμα (revision 7363577)
-Συρία (revision 9703382)
-Ευρασία (revision 9667362)
-Θάλασσα του Σολομώντα (revision 9466850)
-Πόλη του Μεξικού (revision 9692487)
-Σημιτικές γλώσσες (revision 9595481)
-Ισραήλ (revision 9777728)
-Ναβουχοδονόσορ Β' (revision 9785243)
-Αρδέννες (revision 9438776)
-Λοτζ (revision 9812741)
-Παρθένοι Νήσοι (revision 9466868)
-Κεντρική Ευρώπη (revision 9712626)
-Λάπις λάζουλι (revision 9356278)
-Πόλεμος του Έβδομου Συνασπισμού (revision 9526274)
-Βέλγιο (revision 9785761)
-Γκιμαράες (revision 9256928)
-Σπονδυλική στήλη (revision 9772196)
-Κομμούνα του Παρισιού (Γαλλική επανάσταση) (revision 8737961)
-Κοινοπολιτεία των Εθνών (revision 9713895)
-Τηλεμετρία (revision 9300418)
-Β΄ Παγκόσμιος Πόλεμος (revision 9796929)
-Σουηδία (revision 9724663)
+Κάρολος Λινναίος (revision 9170651)
+Εθνική Βιβλιοθήκη της Μαυριτανίας (revision 9771958)
+Εθνική Βιβλιοθήκη της Σαουδικής Αραβίας (revision 9777111)
+Εθνική Βιβλιοθήκη της Ιορδανίας (revision 9510012)
+Κλαδιστική (revision 7593647)
+Κετόζες (revision 9015709)
+Υδροξύλιο (revision 9719647)
+Απειλούμενο είδος (revision 9387012)
+Νέφος (revision 9753949)
+Κατάρρινοι (revision 9802799)
+Επικοινωνία (revision 9810024)
+Χημικός τύπος (revision 9478340)
+Εθνικά Αρχεία και Βιβλιοθήκη της Αιθιοπίας (revision 9608078)
+Ολιγόκαινος εποχή (revision 8882927)
== End of Parsed pages ==
-- Wikipedia parsing ended at: 2022-12-15 00:00:15.230612
+- Wikipedia parsing ended at: 2022-12-18 20:30:49.244663
-63 characters appeared 1687423 times.
+62 characters appeared 918903 times.
Most Frequent characters:
-[ 0] Char α: 9.291090615690315 %
-[ 1] Char ο: 8.043092929277366 %
-[ 2] Char τ: 7.9854310389274055 %
-[ 3] Char ι: 6.7272995567797755 %
-[ 4] Char ν: 6.033816061532882 %
-[ 5] Char ε: 5.973842954611855 %
-[ 6] Char ρ: 4.455077357603873 %
-[ 7] Char σ: 4.30638909153188 %
-[ 8] Char κ: 4.299455441818679 %
-[ 9] Char η: 3.817951989513003 %
-[10] Char ς: 3.5992160827486646 %
-[11] Char π: 3.4671804283810284 %
-[12] Char μ: 3.293483613770821 %
-[13] Char υ: 3.188708462549106 %
-[14] Char λ: 2.825551151074745 %
-[15] Char ί: 2.374330562046387 %
-[16] Char ό: 1.996061449915048 %
-[17] Char ά: 1.9162948472315477 %
-[18] Char γ: 1.7252935393200164 %
-[19] Char έ: 1.6340893777079015 %
-[20] Char δ: 1.4231167881438145 %
-[21] Char ω: 1.3993527408361743 %
-[22] Char ή: 1.3272309314262043 %
-[23] Char χ: 1.1665717487553506 %
-[24] Char ύ: 1.0015271807958053 %
-[25] Char θ: 0.9180863363839417 %
-[26] Char β: 0.8104666109209131 %
-[27] Char ώ: 0.779828175863432 %
-[28] Char φ: 0.699231905692882 %
-[29] Char ξ: 0.37246143972199025 %
-[30] Char ζ: 0.29607276895004986 %
-[31] Char e: 0.2829166130839748 %
-[32] Char a: 0.25891551792289186 %
-[33] Char i: 0.21506166503597496 %
-[34] Char n: 0.19153466558177767 %
-[35] Char r: 0.19141614165505627 %
-[36] Char o: 0.18294168089447638 %
-[37] Char s: 0.17677843670496374 %
-[38] Char t: 0.15899984769675415 %
-[39] Char l: 0.12883550834615862 %
-[40] Char c: 0.10969389418065299 %
-[41] Char d: 0.10281950643081196 %
-[42] Char ψ: 0.09772297758179188 %
+[ 0] Char α: 9.042630179681641 %
+[ 1] Char ο: 7.761537398397872 %
+[ 2] Char τ: 7.389680956531865 %
+[ 3] Char ι: 7.071584269503963 %
+[ 4] Char ν: 6.1224090029089036 %
+[ 5] Char ε: 5.937188147171138 %
+[ 6] Char κ: 4.257359046602308 %
+[ 7] Char ρ: 4.217311294010358 %
+[ 8] Char σ: 4.050373107934135 %
+[ 9] Char η: 3.6424954538183028 %
+[10] Char π: 3.53171118170253 %
+[11] Char ς: 3.3343018795237365 %
+[12] Char μ: 3.2733596473185957 %
+[13] Char υ: 3.02023173283796 %
+[14] Char λ: 2.6589313561932 %
+[15] Char ί: 2.381426548830508 %
+[16] Char ό: 1.9545044471505697 %
+[17] Char ά: 1.8594998601593422 %
+[18] Char γ: 1.7558980654106038 %
+[19] Char δ: 1.6237840120230318 %
+[20] Char έ: 1.569806606355622 %
+[21] Char ω: 1.5474973963519545 %
+[22] Char ή: 1.323969994656672 %
+[23] Char χ: 1.1194870405254962 %
+[24] Char ύ: 1.0730185884690766 %
+[25] Char θ: 1.0217618181679675 %
+[26] Char ώ: 0.7902901612030867 %
+[27] Char φ: 0.7704839357364162 %
+[28] Char β: 0.7675456495408112 %
+[29] Char ξ: 0.4437900409510035 %
+[30] Char ζ: 0.4305133403634551 %
+[31] Char a: 0.4036334629444022 %
+[32] Char e: 0.39601568391875963 %
+[33] Char i: 0.3618445037180203 %
+[34] Char n: 0.3161378295641651 %
+[35] Char o: 0.31026125717295516 %
+[36] Char s: 0.2842519830711185 %
+[37] Char r: 0.2519308349194638 %
+[38] Char t: 0.23560702272165832 %
+[39] Char l: 0.20110936627696285 %
+[40] Char c: 0.19925933422787825 %
+[41] Char h: 0.1609527882703615 %
+[42] Char d: 0.14419367441394795 %
+[43] Char u: 0.13690237163226152 %
+[44] Char m: 0.1365758953883054 %
+[45] Char p: 0.11328725665276966 %
+[46] Char ψ: 0.10240471518756604 %
-The first 43 characters have an accumulated ratio of 0.9924523963463813.
-The first 6 characters have an accumulated ratio of 0.440545731568196.
-All characters whose order is over 27 have an accumulated ratio of 0.03465402569480207.
+The first 47 characters have an accumulated ratio of 0.9949875014011275.
+The first 6 characters have an accumulated ratio of 0.4332502995419538.
+All characters whose order is over 31 have an accumulated ratio of 0.03350734517136193.
-1515 sequences found.
+1389 sequences found.
-First 852 (typical positive ratio): 0.9950198012242328
-Next 229 (1081-852): 0.003981133733535591
-Rest: 0.0009990650422315728
+First 849 (typical positive ratio): 0.9950207709120384
+Next 223 (1072-849): 0.003984435961508326
+Rest: 0.0009947931264532306
-- Processing end: 2022-12-15 00:00:15.353968
+- Processing end: 2022-12-18 20:30:49.348223
diff --git a/script/charsets/cp737.py b/script/charsets/cp737.py
new file mode 100644
index 0000000..be9c4bc
--- /dev/null
+++ b/script/charsets/cp737.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+# NOTE: I use CP737 and not IBM737 as the main encoding name, since iconv
+# conversion failed with IBM737 with the file from #21 and in BuildLangModel.py
+# script, even though these are supposed to be synonyms.
+name = 'CP737'
+aliases = ['IBM737', 'OEM 737', 'MS-DOS Greek']
+
+language = \
+{
+ # Wikipedia tells us: Code page 737 (CCSID 737) (also known as CP 737,
+ # IBM 00737, and OEM 737, MS-DOS Greek) is a code page used under DOS to
+ # write the Greek language.[4] It was much more popular than code page
+ # 869 although it lacks the letters ΐ and ΰ.
+ 'complete': [ 'el' ],
+ 'incomplete': []
+}
+
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
+ LET,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
+]
diff --git a/script/langs/el.py b/script/langs/el.py
index 4c8352b..8bc6406 100644
--- a/script/langs/el.py
+++ b/script/langs/el.py
@@ -45,11 +45,11 @@ import re
name = 'Greek'
code = 'el'
use_ascii = False
-charsets = ['ISO-8859-7', 'WINDOWS-1253']
+charsets = ['ISO-8859-7', 'WINDOWS-1253', 'CP737']
## Optional Properties ##
alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
-start_pages = ['Πύλη:Κύρια']
+start_pages = ['Πρωτεύοντα']
wikipedia_code = code
case_mapping = True
diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp
index 4825977..14ada24 100644
--- a/src/LangModels/LangGreekModel.cpp
+++ b/src/LangModels/LangGreekModel.cpp
@@ -38,50 +38,51 @@
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Greek *********/
/**
* Generated by BuildLangModel.py
- * On: 2022-12-15 00:00:15.231612
+ * On: 2022-12-18 20:30:49.245016
**/
-/* Character Mapping Table:
- * ILL: illegal character.
- * CTR: control character specific to the charset.
- * RET: carriage/return.
- * SYM: symbol (punctuation) that does not belong to word.
- * NUM: 0 - 9.
- *
- * Other characters are ordered by probabilities
- * (0 is the most common character in the language).
- *
- * Orders are generic to a language. So the codepoint with order X in
- * CHARSET1 maps to the same character as the codepoint with the same
- * order X in CHARSET2 for the same language.
- * As such, it is possible to get missing order. For instance the
- * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
- * even though they are both used for French. Same for the euro sign.
- */
-static const unsigned char Iso_8859_7_CharToOrderMap[] =
+ /* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage/return.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, it is possible to get missing order. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+ static const unsigned char Iso_8859_7_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 4X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 6X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 19, 22, 15,SYM, 16,SYM, 24, 27, /* BX */
- 56, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* CX */
- 11, 6,ILL, 7, 2, 13, 28, 23, 42, 21, 46, 60, 17, 19, 22, 15, /* DX */
- 61, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* EX */
- 11, 6, 10, 7, 2, 13, 28, 23, 42, 21, 46, 60, 16, 24, 27,ILL, /* FX */
+ SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 20, 22, 15,SYM, 16,SYM, 24, 26, /* BX */
+ 58, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* CX */
+ 10, 7,ILL, 8, 2, 13, 27, 23, 46, 21, 49, 60, 17, 20, 22, 15, /* DX */
+ 61, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* EX */
+ 10, 7, 11, 8, 2, 13, 27, 23, 46, 21, 49, 60, 16, 24, 26,ILL, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@@ -91,134 +92,164 @@ static const unsigned char Windows_1253_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 4X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 6X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 9X */
SYM,SYM, 17,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 19, 22, 15,SYM, 16,SYM, 24, 27, /* BX */
- 56, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* CX */
- 11, 6,ILL, 7, 2, 13, 28, 23, 42, 21, 46, 60, 17, 19, 22, 15, /* DX */
- 61, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* EX */
- 11, 6, 10, 7, 2, 13, 28, 23, 42, 21, 46, 60, 16, 24, 27,ILL, /* FX */
+ SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 20, 22, 15,SYM, 16,SYM, 24, 26, /* BX */
+ 58, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* CX */
+ 10, 7,ILL, 8, 2, 13, 27, 23, 46, 21, 49, 60, 17, 20, 22, 15, /* DX */
+ 61, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* EX */
+ 10, 7, 11, 8, 2, 13, 27, 23, 46, 21, 49, 60, 16, 24, 26,ILL, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Cp737_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, 10, /* 8X */
+ 7, 8, 2, 13, 27, 23, 46, 21, 0, 28, 18, 19, 5, 30, 9, 25, /* 9X */
+ 3, 6, 14, 12, 4, 29, 1, 10, 7, 8, 11, 2, 13, 27, 23, 46, /* AX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
+ 21, 17, 20, 22, 49, 15, 16, 24, 60, 26, 17, 20, 22, 15, 16, 24, /* EX */
+ 26,SYM,SYM,SYM, 49, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
-static const int Unicode_Char_size = 86;
+static const int Unicode_Char_size = 94;
static const unsigned int Unicode_CharOrder[] =
{
- 65, 32, 67, 40, 68, 41, 69, 31, 73, 33, 76, 39, 78, 34, 79, 36,
- 82, 35, 83, 37, 84, 38, 97, 32, 99, 40, 100, 41, 101, 31,105, 33,
- 108, 39, 110, 34, 111, 36, 114, 35, 115, 37, 116, 38, 902, 17,904, 19,
- 905, 22, 906, 15, 908, 16, 910, 24, 911, 27, 913, 0, 914, 26,915, 18,
- 916, 20, 917, 5, 918, 30, 919, 9, 920, 25, 921, 3, 922, 8,923, 14,
- 924, 12, 925, 4, 926, 29, 927, 1, 928, 11, 929, 6, 931, 7,931, 10,
- 932, 2, 933, 13, 934, 28, 935, 23, 936, 42, 937, 21, 940, 17,941, 19,
- 942, 22, 943, 15, 945, 0, 946, 26, 947, 18, 948, 20, 949, 5,950, 30,
- 951, 9, 952, 25, 953, 3, 954, 8, 955, 14, 956, 12, 957, 4,958, 29,
- 959, 1, 960, 11, 961, 6, 962, 10, 963, 7, 964, 2, 965, 13,966, 28,
- 967, 23, 968, 42, 969, 21, 972, 16, 973, 24, 974, 27,
+ 65, 31, 67, 40, 68, 42, 69, 32, 72, 41, 73, 33, 76, 39, 77, 44,
+ 78, 34, 79, 35, 80, 45, 82, 37, 83, 36, 84, 38, 85, 43, 97, 31,
+ 99, 40, 100, 42, 101, 32, 104, 41, 105, 33, 108, 39, 109, 44,110, 34,
+ 111, 35, 112, 45, 114, 37, 115, 36, 116, 38, 117, 43, 902, 17,904, 20,
+ 905, 22, 906, 15, 908, 16, 910, 24, 911, 26, 913, 0, 914, 28,915, 18,
+ 916, 19, 917, 5, 918, 30, 919, 9, 920, 25, 921, 3, 922, 6,923, 14,
+ 924, 12, 925, 4, 926, 29, 927, 1, 928, 10, 929, 7, 931, 8,931, 11,
+ 932, 2, 933, 13, 934, 27, 935, 23, 936, 46, 937, 21, 940, 17,941, 20,
+ 942, 22, 943, 15, 945, 0, 946, 28, 947, 18, 948, 19, 949, 5,950, 30,
+ 951, 9, 952, 25, 953, 3, 954, 6, 955, 14, 956, 12, 957, 4,958, 29,
+ 959, 1, 960, 10, 961, 7, 962, 11, 963, 8, 964, 2, 965, 13,966, 27,
+ 967, 23, 968, 46, 969, 21, 972, 16, 973, 24, 974, 26,
};
-/* Model Table:
- * Total considered sequences: 1515 / 1849
- * - Positive sequences: first 852 (0.9950198012242328)
- * - Probable sequences: next 229 (1081-852) (0.003981133733535591)
- * - Neutral sequences: last 768 (0.0009990650422315728)
- * - Negative sequences: 334 (off-ratio)
- * Negative sequences: TODO
+ /* Model Table:
+ * Total considered sequences: 1389 / 2209
+ * - Positive sequences: first 849 (0.9950207709120384)
+ * - Probable sequences: next 223 (1072-849) (0.003984435961508326)
+ * - Neutral sequences: last 1137 (0.0009947931264532306)
+ * - Negative sequences: 820 (off-ratio)
+ * Negative sequences: TODO
*/
static const PRUint8 GreekLangModel[] =
{
- 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,
- 0,3,3,3,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,
- 2,3,3,3,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,3,3,1,3,3,1,3,3,3,3,3,3,2,3,0,
- 3,3,2,3,3,2,3,2,0,3,0,0,0,0,0,2,0,0,0,0,0,0,
- 3,3,3,2,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,
- 3,3,3,1,3,3,3,3,3,3,0,0,0,0,0,1,0,0,0,0,0,3,
- 3,3,3,3,3,3,2,3,3,3,3,1,2,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,1,3,2,1,3,0,1,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,
- 3,2,3,3,3,3,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,2,0,0,0,0,0,1,0,0,0,0,0,1,
- 3,3,3,3,2,3,3,3,3,3,0,3,3,3,3,3,3,3,2,3,3,
- 3,3,3,3,3,3,3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,1,3,3,
- 3,3,2,3,2,2,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,3,1,3,3,3,3,3,0,3,3,3,0,3,1,0,0,3,1,3,
- 0,0,3,1,3,2,0,3,3,0,1,0,0,0,0,0,0,0,0,0,0,3,
- 1,1,1,0,0,1,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,
- 0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,1,1,3,1,3,0,3,3,3,3,3,1,3,1,
- 3,3,1,3,1,1,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,2,3,3,3,1,3,1,3,2,3,3,3,1,3,3,3,1,3,0,
- 3,3,1,3,0,3,3,3,0,1,0,0,0,0,0,1,1,0,0,0,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,
- 0,3,3,0,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,1,3,3,3,2,3,3,3,3,3,3,3,3,3,3,
- 3,3,2,3,3,3,3,3,1,1,0,0,0,0,0,1,0,0,0,0,0,2,
- 3,3,3,1,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,0,3,
- 3,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 2,3,3,2,3,3,3,3,3,2,3,3,3,1,3,0,0,0,3,0,3,
- 2,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2,
- 3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,0,0,0,3,1,3,
- 1,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2,
- 3,3,1,3,3,3,3,0,3,3,0,0,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,0,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,3,3,1,3,3,3,1,3,0,0,0,3,0,3,
- 3,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,0,3,2,3,3,0,1,3,0,0,1,3,0,3,3,3,1,3,1,
- 3,3,0,3,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,2,3,2,3,1,3,3,3,1,3,3,3,1,3,2,1,3,3,1,3,
- 0,3,2,0,3,3,1,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,3,2,3,2,3,3,3,1,3,3,3,0,3,0,0,0,3,1,3,
- 0,0,3,0,3,1,0,3,2,1,0,0,0,0,0,0,0,0,0,0,0,2,
- 3,3,3,3,3,3,3,1,0,3,1,0,3,3,3,3,3,3,0,3,0,
- 3,3,0,3,3,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,3,0,3,
- 2,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,0,3,3,3,3,0,1,3,1,0,3,3,3,3,3,3,0,3,0,
- 3,3,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,1,3,1,3,3,1,1,3,1,0,1,3,3,3,3,3,2,3,3,
- 3,3,0,2,0,2,2,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,
- 2,2,3,2,3,1,3,3,2,3,3,3,3,0,3,0,0,0,1,0,3,
- 2,0,2,0,3,3,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,1,
- 3,3,3,3,2,3,3,2,0,3,0,0,1,3,3,3,3,3,2,3,0,
- 3,3,0,3,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,1,3,0,3,0,0,1,3,0,2,1,3,0,3,2,3,1,3,0,
- 3,3,0,3,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
- 3,3,1,3,2,3,1,1,1,3,0,1,2,2,1,3,3,3,1,3,1,
- 3,3,1,2,0,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,0,
- 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,2,3,3,3,2,3,3,3,3,3,0,
- 0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,2,3,3,3,3,3,3,3,3,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,3,2,3,3,3,2,3,3,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,1,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,0,
- 0,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,
- 0,0,0,0,1,1,0,0,0,0,2,2,2,3,3,3,3,3,3,3,3,0,
- 0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,2,2,3,3,3,2,3,1,0,
- 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,2,1,1,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,1,1,3,2,2,3,2,3,0,
- 0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,1,3,3,2,3,3,2,1,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,2,3,3,2,1,2,1,2,0,
- 3,3,1,2,0,3,0,0,0,3,0,0,0,3,0,3,2,2,0,1,0,
- 2,3,0,1,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,0,3,3,3,0,2,
+ 3,3,3,1,3,3,3,3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,
+ 3,3,3,2,3,3,3,3,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,2,3,2,3,3,3,1,2,3,3,3,3,3,3,1,1,3,3,3,
+ 1,3,2,3,1,1,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,1,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,
+ 3,1,3,3,3,3,3,3,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,3,
+ 3,3,3,3,3,3,3,2,3,3,2,2,3,3,1,3,3,3,3,3,3,3,3,
+ 2,3,3,3,1,1,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,1,
+ 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,3,3,3,3,
+ 3,3,3,3,3,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,3,3,3,3,3,
+ 3,3,3,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,3,2,3,3,3,0,3,3,3,3,3,3,1,3,3,3,3,
+ 3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,3,0,3,2,3,3,3,0,3,3,3,0,3,0,0,0,3,3,1,1,0,
+ 3,1,3,2,3,3,3,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,3,
+ 3,3,3,3,3,3,0,3,1,3,3,1,0,3,3,3,3,3,0,0,3,3,3,
+ 0,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,
+ 3,3,2,3,3,3,1,1,1,3,3,1,3,3,1,3,3,3,1,0,3,3,3,
+ 0,3,0,3,3,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,3,3,3,3,3,
+ 3,0,3,2,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,2,3,3,1,2,3,3,2,3,3,3,3,3,3,2,3,3,3,3,
+ 1,3,3,3,3,3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,3,0,3,0,
+ 3,0,3,0,2,3,3,3,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2,
+ 1,3,3,2,3,2,3,3,3,2,3,3,3,0,3,0,0,0,3,3,0,2,0,
+ 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,3,0,1,0,
+ 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,0,3,3,3,3,3,0,3,0,0,3,3,3,3,3,3,3,2,3,3,3,
+ 3,3,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,1,3,2,3,0,3,0,0,1,3,1,3,3,3,0,0,3,3,3,
+ 0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,0,3,3,3,1,3,0,0,0,3,3,0,3,0,
+ 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,1,3,3,3,1,3,3,3,0,3,0,1,2,3,3,0,1,3,
+ 2,0,3,1,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,3,1,3,1,3,3,3,0,3,3,3,0,3,0,0,0,3,3,0,0,0,
+ 2,0,3,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,0,3,0,3,1,0,2,3,3,3,3,3,0,0,3,3,3,
+ 0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,3,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,3,3,0,2,0,
+ 3,0,3,0,3,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,0,3,3,3,0,3,0,3,0,0,3,3,2,2,3,3,0,0,3,3,3,
+ 0,3,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,1,3,3,3,2,3,3,3,0,3,0,0,0,2,3,0,3,0,
+ 2,0,3,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,0,3,1,3,0,0,1,3,3,3,3,3,1,0,3,3,3,
+ 0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,0,3,0,3,0,3,0,3,0,1,0,2,3,3,3,3,2,2,3,2,3,
+ 0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,0,3,0,0,0,3,0,0,0,3,1,3,2,3,0,0,3,3,3,
+ 0,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,1,3,1,3,0,0,0,3,0,0,1,3,0,3,3,2,0,0,3,3,3,
+ 0,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,2,3,3,3,2,3,3,3,3,3,2,3,3,3,3,0,
+ 0,1,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,0,
+ 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,0,
+ 0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,3,2,3,2,3,1,3,3,2,1,0,
+ 0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,2,3,3,1,3,2,3,3,1,3,2,3,0,
+ 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,2,2,3,1,3,2,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,1,3,3,0,3,3,2,1,3,3,2,2,0,
+ 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,2,2,3,3,2,3,1,3,2,2,0,
+ 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,2,3,2,3,3,2,2,1,2,3,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,2,3,1,3,1,1,2,3,2,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,2,3,2,3,3,3,3,3,1,3,1,3,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,2,3,2,1,1,1,2,0,1,3,3,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,3,1,3,2,3,2,2,0,
+ 3,3,0,2,0,3,0,0,0,3,0,0,0,3,0,3,2,3,0,0,0,2,2,
+ 0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
@@ -226,8 +257,8 @@ const SequenceModel Iso_8859_7GreekModel =
{
Iso_8859_7_CharToOrderMap,
GreekLangModel,
- 43,
- (float)0.9990009349577684,
+ 47,
+ (float)0.9990052068735468,
PR_FALSE,
"ISO-8859-7",
"el"
@@ -237,22 +268,33 @@ const SequenceModel Windows_1253GreekModel =
{
Windows_1253_CharToOrderMap,
GreekLangModel,
- 43,
- (float)0.9990009349577684,
+ 47,
+ (float)0.9990052068735468,
PR_FALSE,
"WINDOWS-1253",
"el"
};
+const SequenceModel Cp737GreekModel =
+{
+ Cp737_CharToOrderMap,
+ GreekLangModel,
+ 47,
+ (float)0.9990052068735468,
+ PR_FALSE,
+ "CP737",
+ "el"
+};
+
const LanguageModel GreekModel =
{
"el",
Unicode_CharOrder,
- 86,
+ 94,
GreekLangModel,
- 43,
+ 47,
6,
- (float)0.440545731568196,
- 27,
- (float)0.03465402569480207,
+ (float)0.4332502995419538,
+ 31,
+ (float)0.03350734517136193,
};
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index b0aa01a..9bf3ad3 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -69,6 +69,7 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Cp737GreekModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BulgarianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BulgarianModel);
diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h
index fa54561..86dbae5 100644
--- a/src/nsSBCharSetProber-generated.h
+++ b/src/nsSBCharSetProber-generated.h
@@ -38,7 +38,7 @@
#ifndef nsSingleByteCharSetProber_generated_h__
#define nsSingleByteCharSetProber_generated_h__
-#define NUM_OF_SEQUENCE_MODELS 115
+#define NUM_OF_SEQUENCE_MODELS 116
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Windows_1256ArabicModel;
@@ -64,6 +64,7 @@ extern const SequenceModel Windows_1252GermanModel;
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;
+extern const SequenceModel Cp737GreekModel;
extern const SequenceModel Iso_8859_1EnglishModel;
extern const SequenceModel Windows_1252EnglishModel;
diff --git a/test/el/cp737.txt b/test/el/cp737.txt
new file mode 100644
index 0000000..4c8c14c
--- /dev/null
+++ b/test/el/cp737.txt
@@ -0,0 +1 @@
+ 櫘 夘 ⤦ ࡫ 磜 ⩩ 回 ⤦ Marmota, 餫 樜 . ᭦ 回 夘 婫 ᫦ .