summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md1
-rw-r--r--script/BuildLangModelLogs/LangGreekModel.log436
-rw-r--r--script/charsets/cp737.py79
-rw-r--r--script/langs/el.py4
-rw-r--r--src/LangModels/LangGreekModel.cpp344
-rw-r--r--src/nsSBCSGroupProber.cpp1
-rw-r--r--src/nsSBCharSetProber-generated.h3
-rw-r--r--test/el/cp737.txt1
8 files changed, 489 insertions, 380 deletions
diff --git a/README.md b/README.md
index 0b85469..ab200bb 100644
--- a/README.md
+++ b/README.md
@@ -86,6 +86,7 @@ uchardet started as a C language binding of the original C++ implementation of t
* UTF-8
* ISO-8859-7
* WINDOWS-1253
+ * CP737
* Hebrew
* UTF-8
* ISO-8859-8
diff --git a/script/BuildLangModelLogs/LangGreekModel.log b/script/BuildLangModelLogs/LangGreekModel.log
index f81f77f..6a8a092 100644
--- a/script/BuildLangModelLogs/LangGreekModel.log
+++ b/script/BuildLangModelLogs/LangGreekModel.log
@@ -1,247 +1,231 @@
= Logs of language model for Greek (el) =
- Generated by BuildLangModel.py
-- Started: 2022-12-14 23:56:52.996274
+- Started: 2022-12-18 20:25:01.002309
- Maximum depth: 4
- Max number of pages: 200
== Parsed pages ==
-Πύλη:Κύρια (revision 9720674)
-Θάνατος της Μάχσα Αμίνι (revision 9785479)
-Ιράκ (revision 9784253)
-5ος αιώνας π.Χ. (revision 9503435)
-1960 (revision 9026602)
-4ος αιώνας π.Χ. (revision 9500049)
-Σάντα Μαρία ντελ Πόπολο (revision 9813223)
-Ρίσι Σούνακ (revision 9807035)
-Γαλλία (revision 9809487)
-Γενικός Γραμματέας του Κομμουνιστικού Κόμματος της Κίνας (revision 9790632)
-Ολυμπιακοί Αγώνες (revision 9767748)
-Αμπέμπε Μπικίλα (revision 9629134)
-Κεντρική Αμερική (revision 9436648)
-Ζήνων ο Ελεάτης (revision 9687689)
-25 Απριλίου (revision 9528548)
-Ιαπωνία (revision 9738909)
-Η Σταύρωση του Αγίου Πέτρου (revision 8027915)
-Δόλιχος (revision 9642281)
-Κατάρ (revision 9777831)
-Καραϊβική (revision 9776894)
-Πρωτεύουσα της Γαλλίας (revision 9694896)
-16 Οκτωβρίου (revision 9719896)
-Επισκοπή της Ρώμης (revision 8247187)
-8ος αιώνας π.Χ. (revision 9509177)
-1516 (revision 8891470)
-Συριακή Ορθόδοξη Εκκλησία (revision 8814522)
-Θεόφραστος (revision 9645199)
-16 Σεπτεμβρίου (revision 9054145)
-Ατλαντικός Ωκεανός (revision 9450595)
-Ηνωμένο Βασίλειο (revision 9738607)
-Καναδάς (revision 9736902)
-Νότια Ασία (revision 9718470)
-Άρθουρ Γουέλσλι, δούκας του Ουέλλινγκτον (revision 9810101)
-Βιετνάμ (revision 9767839)
-Μεροβίγγειοι (revision 9720122)
-Ντομινίκ Γουίλκινς (revision 9798740)
-BIBSYS (revision 9155553)
-Γουατεμάλα (revision 9770327)
-Αγγλική γλώσσα (revision 9779698)
-Αρκτικός Ωκεανός (revision 9450607)
-Αθλητισμός (revision 9816520)
-Δυτική Ευρώπη (revision 9667409)
-Αγγλία (revision 9730532)
-Λεωτυχίδας ο Λακεδαιμόνιος (revision 9655599)
-Καινή Διαθήκη (revision 9725581)
-1660 (revision 7905687)
-Ευρασία-Αφρική (revision 9667364)
-Άμστερνταμ (revision 9701304)
-Πολυνησία (revision 9667374)
-Άρσης (revision 8381416)
-Τασκένδη (revision 9674027)
-Système universitaire de documentation (revision 9519040)
-1964 (revision 9811809)
-COVID-19 (revision 9751162)
-Σεισμός (revision 9555986)
-Μαρόκο (revision 9816451)
-Ευρωπαϊκή Ένωση (revision 9807037)
-Αρχαιοελληνική πυγμαχία (revision 9008193)
-Άντονι Ήντεν (revision 9752041)
-11 Αυγούστου (revision 8988727)
-2006 (revision 9797947)
+Πρωτεύοντα (revision 9792164)
+Ευαρχοντομυωξοί (revision 9475530)
+Φολιδωτά (θηλαστικά) (revision 8966182)
+Ανθρώπινη εξέλιξη (revision 9731824)
+Υδατάνθρακες (revision 9276169)
+Άνθρωπος (revision 9804050)
+National Library of the Czech Republic (revision 9499518)
+Ταξινομία (revision 6174527)
+Δεοξυριβόζη (revision 9735675)
+Συστηματική ταξινόμηση (revision 9163863)
+Οικογένεια (βιολογία) (revision 8380547)
+Μονοσακχαρίτης (revision 8520367)
+Ευλιπότυφλα (revision 8635098)
+Γαλάγος (revision 9624211)
+Ανθρωποειδή (revision 9802784)
+Μυρμήγκι (revision 9743672)
+Primates (revision 9792164)
+Εθνική Βιβλιοθήκη της Μποτσουάνα (revision 9771961)
+Εθνική Βιβλιοθήκη της Σλοβακίας (revision 9545464)
+Κίνα (revision 9794230)
+Μονοσακχαρίτες (revision 8520367)
+Άνθρακας (revision 9698608)
+Τερμίτης (revision 8570600)
+Virtual International Authority File (revision 9547787)
+Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547)
+International Union for Conservation of Nature (revision 9555075)
+Neogene (revision 7970278)
+Ανθρωπoειδή (revision 9802784)
+Λάρυγγας (revision 8037233)
+Θηλαστικά (revision 9802762)
+IUCN Red List (revision 9104016)
+Δισακχαρίτης (revision 9301054)
+Ινσουλίνη (revision 9193560)
+Αρχαϊκοί Homo sapiens (revision 9496339)
+Εθνική Βιβλιοθήκη της Μοζαμβίκης (revision 9771960)
+Εθνική Βιβλιοθήκη της Πολωνίας (revision 9771967)
+Ολιγοσακχαρίτης (revision 9784937)
+Θεσμός (revision 9409922)
+Μοριακό βάρος (revision 8588261)
+Παράνθρωποι (revision 9187211)
+Χρονολόγιο της ανθρώπινης εξέλιξης (revision 9494488)
+Κοινός πρόγονος (revision 7955205)
+Ασία (revision 9640488)
+Εθνική Βιβλιοθήκη του Βανουάτου (revision 9510031)
+Συνομοταξία (revision 8090691)
+Διαδίκτυο (revision 9818610)
+Τριγλυκερίδιο (revision 8991916)
+Εθνική Βιβλιοθήκη της Λετονίας (revision 9736743)
+Εθνική βιβλιοθήκη της Σουηδίας (revision 9741133)
+Ζώα (revision 9797988)
+Απειλούμενα είδη (revision 9387012)
+Εθνική Βιβλιοθήκη της Μιανμάρ (revision 9771959)
+Silurian (revision 7083264)
+Γερμανική γλώσσα (revision 9768836)
+Ζωολογία (revision 9597532)
+Σπονδυλωτά (revision 8936763)
+Χορδωτά (revision 9800855)
+Εθνική Βιβλιοθήκη του Ελ Σαλβαδόρ (revision 9608126)
+Μακρομόρια (revision 8962637)
+Homo sapiens (revision 9804050)
+Γλυκίδια (revision 8976376)
Κατάλογος καθιερωμένων όρων (revision 9747802)
-Ρωσία (revision 9756811)
-Μπιτς βόλεϊ (revision 9629816)
-Αραβική γλώσσα (revision 9758388)
-7ος αιώνας π.Χ. (revision 9509175)
+Paleogene (revision 7772183)
+Γένος (βιολογία) (revision 8620951)
+Βραδυποδόμορφα (revision 8793874)
+Εθνική Βιβλιοθήκη της Ουκρανίας (revision 9818749)
+Περίοδος (γεωλογία) (revision 9598229)
+Γραμμομόριο (revision 9175982)
+Νουκλεϊκά οξέα (revision 9020237)
+Γάλα (revision 9473543)
+Μετάλλαξη (revision 9662655)
+Γαλακτόζη (revision 8983758)
+Φάλαινα (revision 9455804)
+Εθνική Βιβλιοθήκη της Ισπανίας (revision 9771953)
+Γλυκογόνο (revision 8033277)
+Ισπανική γλώσσα (revision 9751022)
+Φωνητικές χορδές (revision 9179304)
+Κανονικές συνθήκες (revision 9776846)
+Άλπεις (revision 9759633)
+Αντίδραση συμπύκνωσης (revision 8965637)
+National Diet Library (revision 9533181)
+Εθνική Βιβλιοθήκη της Βραζιλίας (revision 9516238)
+Homo sapiens sapiens (revision 9804050)
+Εθνική Βιβλιοθήκη των Μπαρμπάντος (revision 9608141)
+Μουντάνεουμ (revision 9387431)
+Ζώο (revision 9797988)
+Εθνική και Πανεπιστημιακή Βιβλιοθήκη της Ισλανδίας (revision 9510045)
+Μόριο (revision 9737689)
+Εθνική Βιβλιοθήκη της Ανδόρας (revision 9771949)
+Βασίλειο (βιολογία) (revision 9171746)
+Εθνική και Πανεπιστημιακή Βιβλιοθήκη «Άγιος Κλήμης της Αχρίδας» (revision 9608210)
+Κλίμα (revision 9262599)
+Δακτυλιοσκώληκες (revision 8985128)
+Ασπάλακας (revision 9429446)
+Μόλυνση (revision 8512424)
International Standard Name Identifier (revision 6861942)
-Ελευθερία, Ισότητα, Αδελφοσύνη (revision 8591437)
-Συμβούλιο των Αντιπροσώπων του Ιράκ (revision 9812554)
-Βιοεπιστήμες (revision 9175912)
-21 Οκτωβρίου (revision 9123772)
-Καμήλα (revision 9815157)
-1526 (revision 7905955)
-Μικτή οικονομία (revision 9756694)
-Εμπραχίμ Ραΐσι (revision 9662244)
-Βαθυσκάφος (revision 8755412)
-Γιοχάνεσμπουργκ (revision 9234192)
-Γιαζίντι (revision 9251594)
-Πακιστάν (revision 9719399)
-27 Νοεμβρίου (revision 9168386)
-25 Οκτωβρίου (revision 9732587)
-Ειρηνικός Ωκεανός (revision 9659005)
-Βία κατά των γυναικών (revision 9404071)
-Γιανίκ Νοά (revision 9721039)
-Μεξικό (revision 9672139)
-Ευρώπη (revision 9806540)
-Ακκάδιοι (revision 8546428)
-Είλωτες (revision 9472621)
-Λεύκιππος (φιλόσοφος) (revision 8933581)
-Δόλοπες (revision 9091900)
-Ήλιος (revision 9797813)
-Αντίς Αμπέμπα (revision 9703571)
-4 Απριλίου (revision 9797052)
-Μεγασθένης (revision 9397713)
-Ισλαμική Συμβουλευτική Συνέλευση (revision 6895099)
-Αμχαρική γλώσσα (revision 8252762)
-Προφήτης Ιωνάς (revision 8981060)
-Μεσοποταμία (revision 9478563)
-Deutsche Welle (revision 9697594)
-Θεοδόσιος Α' (revision 9717330)
-Τανζανία (revision 9672789)
-Μπαρόκ (revision 9498929)
-Διαδηλώσεις στο Ιράν για το θάνατο της Μάχσα Αμίνι (revision 9746434)
-Ρώμη (revision 9684590)
-Εκκλησία (αρχιτεκτονική) (revision 9466423)
-Integrated Authority File (revision 8518544)
-Μπραζίλια (revision 9696231)
-2η χιλιετία (revision 9650679)
-Ιράν (revision 9804479)
-Ολυμπιονίκης (revision 9767748)
-The Guardian (revision 9533576)
-Ανεξαρτησία (revision 9730220)
-Σεισμός στο Αγκαντίρ (1960) (revision 9042146)
-Γκέμπχαρντ Λέμπερεχτ φον Μπλύχερ (revision 9711616)
-Αντρές Μανουέλ Λόπες Ομπραδόρ (revision 9591461)
-Ηνωμένα Έθνη (revision 9596090)
-Καρλομάγνος (revision 9776930)
-Διάδης ο Πελλαίος (revision 8453992)
-Ισλάμ (revision 9535017)
-International Standard Serial Number (revision 9426410)
-Πολ Βερλέν (revision 9620098)
-Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547)
-Παντζάμποι (revision 9410265)
-Θερινοί Ολυμπιακοί Αγώνες 2004 (revision 9646436)
-Κεντρική Αφρική (revision 9666820)
-1695 (revision 7905645)
-1841 (revision 9476734)
-Καράτζ (revision 7678423)
-Αρχιτεκτονική (revision 9699724)
-Κάλι Γιούγκα (revision 9173397)
-Νέπιντο (revision 9786205)
-Ησαΐας (revision 9285628)
-Μιγκέλ Ιδάλγο ι Κοστίγια (revision 6788184)
-Λατινικά (revision 9751029)
-Ριγιάλ του Κατάρ (revision 9194726)
-Καπιταλισμός (revision 9353276)
-Σύστημα ταξινόμησης βιβλιοθήκης (revision 9648804)
-Εκαταίος ο Μιλήσιος (revision 9332195)
-Χαντίθ (revision 9423173)
-24 Οκτωβρίου (revision 9123843)
-Ατρείδες (revision 9269517)
-Νερό (revision 9799579)
-Χριστιανισμός στην Συρία (revision 9504011)
+Υδροξυλομάδα (revision 9719647)
+Εθνική Βιβλιοθήκη του Κουβέιτ (revision 9511761)
+Homo rhodesiensis (revision 7605622)
+Αγγλική γλώσσα (revision 9779698)
+Περιβαλλοντική εκπαίδευση (revision 7971138)
+Γουανίνη (revision 8392293)
+Γριβάδι (revision 9370003)
+Διεθνής Επιτροπή Στρωματογραφίας (revision 9796210)
+Εχινόδερμα (revision 9101031)
+Εθνική Βιβλιοθήκη των Φιλιππινών (revision 9511751)
+Αρτίγονος (revision 9753577)
+Εθνική Βιβλιοθήκη της Σρι Λάνκα (revision 9511705)
+Περιβαλλοντικά προβλήματα (revision 9555971)
+Υπερτάξη (revision 7554395)
+Κατάλογος αντιστοιχίας Λατινικών-Ελληνικών όρων ταξινομικών μονάδων (revision 9562399)
+Κόκκινος κατάλογος της IUCN (revision 9104016)
+Κοινοβουλευτική Βιβλιοθήκη της Γεωργίας (revision 9508234)
+Ασπόνδυλα (revision 9049085)
+Τάξη (βιολογία) (revision 7554395)
+Γρυλοβλαττοειδή (revision 6401187)
+Γλυκόζη (revision 9770284)
+Τουρκικές γλώσσες (revision 9284882)
+Εκπνοή (revision 9611418)
+Ανθρωπίνοι (revision 9103976)
+Εθνική Βιβλιοθήκη του Μαυρικίου (revision 9736776)
+Σαρκοφάγα (revision 8222140)
+Χημική ένωση (revision 9478321)
+Νουκλεοτίδια (revision 8520133)
+Πλειστόκαινο (revision 9225169)
+Υποοικογένεια (revision 8380547)
+Πόδι (έντομα) (revision 7865328)
+Δημόσια Βιβλιοθήκη Τσαρλς Α. Χάλμπερτ (revision 9607718)
+Δισακχαρίτες (revision 9301054)
+Νορβηγική γλώσσα (revision 9527903)
+Σορβόζη (revision 9702780)
+Bibliothèque nationale de France (revision 9636186)
+1778 (revision 9509259)
+Αμυλοπηκτίνη (revision 7348804)
+Υφομοταξία (revision 9796614)
+Κοινή καταγωγή (revision 7955205)
+Βιβλιοθήκη του Βατικανού (revision 9791596)
+Κράμα (revision 8491814)
+Orrorin tugenensis (revision 8021796)
+Εθνική Βιβλιοθήκη της Γερμανίας (revision 9533197)
Εθνική Βιβλιοθήκη της Ελλάδος (revision 9771951)
-Οθωμανικός Στρατός (revision 8724392)
-Κάτεγατ (revision 9501508)
-Βερμούδες (revision 9767755)
-Floruit (revision 9328997)
-Τρανσυλβανία (revision 9743855)
-Κρυπτεία (revision 9689259)
-Δημοκρατία της Ιρλανδίας (revision 9672626)
-Κούρδοι (revision 9814315)
-Οθέλλος (revision 9168875)
-Κώμα (revision 7363577)
-Συρία (revision 9703382)
-Ευρασία (revision 9667362)
-Θάλασσα του Σολομώντα (revision 9466850)
-Πόλη του Μεξικού (revision 9692487)
-Σημιτικές γλώσσες (revision 9595481)
-Ισραήλ (revision 9777728)
-Ναβουχοδονόσορ Β' (revision 9785243)
-Αρδέννες (revision 9438776)
-Λοτζ (revision 9812741)
-Παρθένοι Νήσοι (revision 9466868)
-Κεντρική Ευρώπη (revision 9712626)
-Λάπις λάζουλι (revision 9356278)
-Πόλεμος του Έβδομου Συνασπισμού (revision 9526274)
-Βέλγιο (revision 9785761)
-Γκιμαράες (revision 9256928)
-Σπονδυλική στήλη (revision 9772196)
-Κομμούνα του Παρισιού (Γαλλική επανάσταση) (revision 8737961)
-Κοινοπολιτεία των Εθνών (revision 9713895)
-Τηλεμετρία (revision 9300418)
-Β΄ Παγκόσμιος Πόλεμος (revision 9796929)
-Σουηδία (revision 9724663)
+Κάρολος Λινναίος (revision 9170651)
+Εθνική Βιβλιοθήκη της Μαυριτανίας (revision 9771958)
+Εθνική Βιβλιοθήκη της Σαουδικής Αραβίας (revision 9777111)
+Εθνική Βιβλιοθήκη της Ιορδανίας (revision 9510012)
+Κλαδιστική (revision 7593647)
+Κετόζες (revision 9015709)
+Υδροξύλιο (revision 9719647)
+Απειλούμενο είδος (revision 9387012)
+Νέφος (revision 9753949)
+Κατάρρινοι (revision 9802799)
+Επικοινωνία (revision 9810024)
+Χημικός τύπος (revision 9478340)
+Εθνικά Αρχεία και Βιβλιοθήκη της Αιθιοπίας (revision 9608078)
+Ολιγόκαινος εποχή (revision 8882927)
== End of Parsed pages ==
-- Wikipedia parsing ended at: 2022-12-15 00:00:15.230612
+- Wikipedia parsing ended at: 2022-12-18 20:30:49.244663
-63 characters appeared 1687423 times.
+62 characters appeared 918903 times.
Most Frequent characters:
-[ 0] Char α: 9.291090615690315 %
-[ 1] Char ο: 8.043092929277366 %
-[ 2] Char τ: 7.9854310389274055 %
-[ 3] Char ι: 6.7272995567797755 %
-[ 4] Char ν: 6.033816061532882 %
-[ 5] Char ε: 5.973842954611855 %
-[ 6] Char ρ: 4.455077357603873 %
-[ 7] Char σ: 4.30638909153188 %
-[ 8] Char κ: 4.299455441818679 %
-[ 9] Char η: 3.817951989513003 %
-[10] Char ς: 3.5992160827486646 %
-[11] Char π: 3.4671804283810284 %
-[12] Char μ: 3.293483613770821 %
-[13] Char υ: 3.188708462549106 %
-[14] Char λ: 2.825551151074745 %
-[15] Char ί: 2.374330562046387 %
-[16] Char ό: 1.996061449915048 %
-[17] Char ά: 1.9162948472315477 %
-[18] Char γ: 1.7252935393200164 %
-[19] Char έ: 1.6340893777079015 %
-[20] Char δ: 1.4231167881438145 %
-[21] Char ω: 1.3993527408361743 %
-[22] Char ή: 1.3272309314262043 %
-[23] Char χ: 1.1665717487553506 %
-[24] Char ύ: 1.0015271807958053 %
-[25] Char θ: 0.9180863363839417 %
-[26] Char β: 0.8104666109209131 %
-[27] Char ώ: 0.779828175863432 %
-[28] Char φ: 0.699231905692882 %
-[29] Char ξ: 0.37246143972199025 %
-[30] Char ζ: 0.29607276895004986 %
-[31] Char e: 0.2829166130839748 %
-[32] Char a: 0.25891551792289186 %
-[33] Char i: 0.21506166503597496 %
-[34] Char n: 0.19153466558177767 %
-[35] Char r: 0.19141614165505627 %
-[36] Char o: 0.18294168089447638 %
-[37] Char s: 0.17677843670496374 %
-[38] Char t: 0.15899984769675415 %
-[39] Char l: 0.12883550834615862 %
-[40] Char c: 0.10969389418065299 %
-[41] Char d: 0.10281950643081196 %
-[42] Char ψ: 0.09772297758179188 %
+[ 0] Char α: 9.042630179681641 %
+[ 1] Char ο: 7.761537398397872 %
+[ 2] Char τ: 7.389680956531865 %
+[ 3] Char ι: 7.071584269503963 %
+[ 4] Char ν: 6.1224090029089036 %
+[ 5] Char ε: 5.937188147171138 %
+[ 6] Char κ: 4.257359046602308 %
+[ 7] Char ρ: 4.217311294010358 %
+[ 8] Char σ: 4.050373107934135 %
+[ 9] Char η: 3.6424954538183028 %
+[10] Char π: 3.53171118170253 %
+[11] Char ς: 3.3343018795237365 %
+[12] Char μ: 3.2733596473185957 %
+[13] Char υ: 3.02023173283796 %
+[14] Char λ: 2.6589313561932 %
+[15] Char ί: 2.381426548830508 %
+[16] Char ό: 1.9545044471505697 %
+[17] Char ά: 1.8594998601593422 %
+[18] Char γ: 1.7558980654106038 %
+[19] Char δ: 1.6237840120230318 %
+[20] Char έ: 1.569806606355622 %
+[21] Char ω: 1.5474973963519545 %
+[22] Char ή: 1.323969994656672 %
+[23] Char χ: 1.1194870405254962 %
+[24] Char ύ: 1.0730185884690766 %
+[25] Char θ: 1.0217618181679675 %
+[26] Char ώ: 0.7902901612030867 %
+[27] Char φ: 0.7704839357364162 %
+[28] Char β: 0.7675456495408112 %
+[29] Char ξ: 0.4437900409510035 %
+[30] Char ζ: 0.4305133403634551 %
+[31] Char a: 0.4036334629444022 %
+[32] Char e: 0.39601568391875963 %
+[33] Char i: 0.3618445037180203 %
+[34] Char n: 0.3161378295641651 %
+[35] Char o: 0.31026125717295516 %
+[36] Char s: 0.2842519830711185 %
+[37] Char r: 0.2519308349194638 %
+[38] Char t: 0.23560702272165832 %
+[39] Char l: 0.20110936627696285 %
+[40] Char c: 0.19925933422787825 %
+[41] Char h: 0.1609527882703615 %
+[42] Char d: 0.14419367441394795 %
+[43] Char u: 0.13690237163226152 %
+[44] Char m: 0.1365758953883054 %
+[45] Char p: 0.11328725665276966 %
+[46] Char ψ: 0.10240471518756604 %
-The first 43 characters have an accumulated ratio of 0.9924523963463813.
-The first 6 characters have an accumulated ratio of 0.440545731568196.
-All characters whose order is over 27 have an accumulated ratio of 0.03465402569480207.
+The first 47 characters have an accumulated ratio of 0.9949875014011275.
+The first 6 characters have an accumulated ratio of 0.4332502995419538.
+All characters whose order is over 31 have an accumulated ratio of 0.03350734517136193.
-1515 sequences found.
+1389 sequences found.
-First 852 (typical positive ratio): 0.9950198012242328
-Next 229 (1081-852): 0.003981133733535591
-Rest: 0.0009990650422315728
+First 849 (typical positive ratio): 0.9950207709120384
+Next 223 (1072-849): 0.003984435961508326
+Rest: 0.0009947931264532306
-- Processing end: 2022-12-15 00:00:15.353968
+- Processing end: 2022-12-18 20:30:49.348223
diff --git a/script/charsets/cp737.py b/script/charsets/cp737.py
new file mode 100644
index 0000000..be9c4bc
--- /dev/null
+++ b/script/charsets/cp737.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# ##### BEGIN LICENSE BLOCK #####
+# Version: MPL 1.1/GPL 2.0/LGPL 2.1
+#
+# The contents of this file are subject to the Mozilla Public License Version
+# 1.1 (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+# http://www.mozilla.org/MPL/
+#
+# Software distributed under the License is distributed on an "AS IS" basis,
+# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+# for the specific language governing rights and limitations under the
+# License.
+#
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+# Jehan <jehan@girinstud.io>
+#
+# Alternatively, the contents of this file may be used under the terms of
+# either the GNU General Public License Version 2 or later (the "GPL"), or
+# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+# in which case the provisions of the GPL or the LGPL are applicable instead
+# of those above. If you wish to allow use of your version of this file only
+# under the terms of either the GPL or the LGPL, and not to allow others to
+# use your version of this file under the terms of the MPL, indicate your
+# decision by deleting the provisions above and replace them with the notice
+# and other provisions required by the GPL or the LGPL. If you do not delete
+# the provisions above, a recipient may use your version of this file under
+# the terms of any one of the MPL, the GPL or the LGPL.
+#
+# ##### END LICENSE BLOCK #####
+
+from codepoints import *
+
+# NOTE: I use CP737 and not IBM737 as the main encoding name, since iconv
+# conversion failed with IBM737 with the file from #21 and in BuildLangModel.py
+# script, even though these are supposed to be synonyms.
+name = 'CP737'
+aliases = ['IBM737', 'OEM 737', 'MS-DOS Greek']
+
+language = \
+{
+ # Wikipedia tells us: Code page 737 (CCSID 737) (also known as CP 737,
+ # IBM 00737, and OEM 737, MS-DOS Greek) is a code page used under DOS to
+ # write the Greek language.[4] It was much more popular than code page
+ # 869 although it lacks the letters ΐ and ΰ.
+ 'complete': [ 'el' ],
+ 'incomplete': []
+}
+
+# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF #
+charmap = \
+[
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X
+ SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X
+
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX
+ LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX
+ LET,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX
+]
diff --git a/script/langs/el.py b/script/langs/el.py
index 4c8352b..8bc6406 100644
--- a/script/langs/el.py
+++ b/script/langs/el.py
@@ -45,11 +45,11 @@ import re
name = 'Greek'
code = 'el'
use_ascii = False
-charsets = ['ISO-8859-7', 'WINDOWS-1253']
+charsets = ['ISO-8859-7', 'WINDOWS-1253', 'CP737']
## Optional Properties ##
alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
-start_pages = ['Πύλη:Κύρια']
+start_pages = ['Πρωτεύοντα']
wikipedia_code = code
case_mapping = True
diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp
index 4825977..14ada24 100644
--- a/src/LangModels/LangGreekModel.cpp
+++ b/src/LangModels/LangGreekModel.cpp
@@ -38,50 +38,51 @@
#include "../nsSBCharSetProber.h"
#include "../nsSBCharSetProber-generated.h"
#include "../nsLanguageDetector.h"
+
#include "../nsLanguageDetector-generated.h"
/********* Language model for: Greek *********/
/**
* Generated by BuildLangModel.py
- * On: 2022-12-15 00:00:15.231612
+ * On: 2022-12-18 20:30:49.245016
**/
-/* Character Mapping Table:
- * ILL: illegal character.
- * CTR: control character specific to the charset.
- * RET: carriage/return.
- * SYM: symbol (punctuation) that does not belong to word.
- * NUM: 0 - 9.
- *
- * Other characters are ordered by probabilities
- * (0 is the most common character in the language).
- *
- * Orders are generic to a language. So the codepoint with order X in
- * CHARSET1 maps to the same character as the codepoint with the same
- * order X in CHARSET2 for the same language.
- * As such, it is possible to get missing order. For instance the
- * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
- * even though they are both used for French. Same for the euro sign.
- */
-static const unsigned char Iso_8859_7_CharToOrderMap[] =
+ /* Character Mapping Table:
+ * ILL: illegal character.
+ * CTR: control character specific to the charset.
+ * RET: carriage/return.
+ * SYM: symbol (punctuation) that does not belong to word.
+ * NUM: 0 - 9.
+ *
+ * Other characters are ordered by probabilities
+ * (0 is the most common character in the language).
+ *
+ * Orders are generic to a language. So the codepoint with order X in
+ * CHARSET1 maps to the same character as the codepoint with the same
+ * order X in CHARSET2 for the same language.
+ * As such, it is possible to get missing order. For instance the
+ * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1
+ * even though they are both used for French. Same for the euro sign.
+ */
+ static const unsigned char Iso_8859_7_CharToOrderMap[] =
{
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 4X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 6X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 19, 22, 15,SYM, 16,SYM, 24, 27, /* BX */
- 56, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* CX */
- 11, 6,ILL, 7, 2, 13, 28, 23, 42, 21, 46, 60, 17, 19, 22, 15, /* DX */
- 61, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* EX */
- 11, 6, 10, 7, 2, 13, 28, 23, 42, 21, 46, 60, 16, 24, 27,ILL, /* FX */
+ SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 20, 22, 15,SYM, 16,SYM, 24, 26, /* BX */
+ 58, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* CX */
+ 10, 7,ILL, 8, 2, 13, 27, 23, 46, 21, 49, 60, 17, 20, 22, 15, /* DX */
+ 61, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* EX */
+ 10, 7, 11, 8, 2, 13, 27, 23, 46, 21, 49, 60, 16, 24, 26,ILL, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
@@ -91,134 +92,164 @@ static const unsigned char Windows_1253_CharToOrderMap[] =
CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 4X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */
- SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 6X */
- 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 8X */
ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 9X */
SYM,SYM, 17,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, /* AX */
- SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 19, 22, 15,SYM, 16,SYM, 24, 27, /* BX */
- 56, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* CX */
- 11, 6,ILL, 7, 2, 13, 28, 23, 42, 21, 46, 60, 17, 19, 22, 15, /* DX */
- 61, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* EX */
- 11, 6, 10, 7, 2, 13, 28, 23, 42, 21, 46, 60, 16, 24, 27,ILL, /* FX */
+ SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 20, 22, 15,SYM, 16,SYM, 24, 26, /* BX */
+ 58, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* CX */
+ 10, 7,ILL, 8, 2, 13, 27, 23, 46, 21, 49, 60, 17, 20, 22, 15, /* DX */
+ 61, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* EX */
+ 10, 7, 11, 8, 2, 13, 27, 23, 46, 21, 49, 60, 16, 24, 26,ILL, /* FX */
+};
+/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
+
+static const unsigned char Cp737_CharToOrderMap[] =
+{
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */
+ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */
+ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */
+ SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */
+ 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */
+ 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, 10, /* 8X */
+ 7, 8, 2, 13, 27, 23, 46, 21, 0, 28, 18, 19, 5, 30, 9, 25, /* 9X */
+ 3, 6, 14, 12, 4, 29, 1, 10, 7, 8, 11, 2, 13, 27, 23, 46, /* AX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */
+ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */
+ 21, 17, 20, 22, 49, 15, 16, 24, 60, 26, 17, 20, 22, 15, 16, 24, /* EX */
+ 26,SYM,SYM,SYM, 49, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */
};
/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */
-static const int Unicode_Char_size = 86;
+static const int Unicode_Char_size = 94;
static const unsigned int Unicode_CharOrder[] =
{
- 65, 32, 67, 40, 68, 41, 69, 31, 73, 33, 76, 39, 78, 34, 79, 36,
- 82, 35, 83, 37, 84, 38, 97, 32, 99, 40, 100, 41, 101, 31,105, 33,
- 108, 39, 110, 34, 111, 36, 114, 35, 115, 37, 116, 38, 902, 17,904, 19,
- 905, 22, 906, 15, 908, 16, 910, 24, 911, 27, 913, 0, 914, 26,915, 18,
- 916, 20, 917, 5, 918, 30, 919, 9, 920, 25, 921, 3, 922, 8,923, 14,
- 924, 12, 925, 4, 926, 29, 927, 1, 928, 11, 929, 6, 931, 7,931, 10,
- 932, 2, 933, 13, 934, 28, 935, 23, 936, 42, 937, 21, 940, 17,941, 19,
- 942, 22, 943, 15, 945, 0, 946, 26, 947, 18, 948, 20, 949, 5,950, 30,
- 951, 9, 952, 25, 953, 3, 954, 8, 955, 14, 956, 12, 957, 4,958, 29,
- 959, 1, 960, 11, 961, 6, 962, 10, 963, 7, 964, 2, 965, 13,966, 28,
- 967, 23, 968, 42, 969, 21, 972, 16, 973, 24, 974, 27,
+ 65, 31, 67, 40, 68, 42, 69, 32, 72, 41, 73, 33, 76, 39, 77, 44,
+ 78, 34, 79, 35, 80, 45, 82, 37, 83, 36, 84, 38, 85, 43, 97, 31,
+ 99, 40, 100, 42, 101, 32, 104, 41, 105, 33, 108, 39, 109, 44,110, 34,
+ 111, 35, 112, 45, 114, 37, 115, 36, 116, 38, 117, 43, 902, 17,904, 20,
+ 905, 22, 906, 15, 908, 16, 910, 24, 911, 26, 913, 0, 914, 28,915, 18,
+ 916, 19, 917, 5, 918, 30, 919, 9, 920, 25, 921, 3, 922, 6,923, 14,
+ 924, 12, 925, 4, 926, 29, 927, 1, 928, 10, 929, 7, 931, 8,931, 11,
+ 932, 2, 933, 13, 934, 27, 935, 23, 936, 46, 937, 21, 940, 17,941, 20,
+ 942, 22, 943, 15, 945, 0, 946, 28, 947, 18, 948, 19, 949, 5,950, 30,
+ 951, 9, 952, 25, 953, 3, 954, 6, 955, 14, 956, 12, 957, 4,958, 29,
+ 959, 1, 960, 10, 961, 7, 962, 11, 963, 8, 964, 2, 965, 13,966, 27,
+ 967, 23, 968, 46, 969, 21, 972, 16, 973, 24, 974, 26,
};
-/* Model Table:
- * Total considered sequences: 1515 / 1849
- * - Positive sequences: first 852 (0.9950198012242328)
- * - Probable sequences: next 229 (1081-852) (0.003981133733535591)
- * - Neutral sequences: last 768 (0.0009990650422315728)
- * - Negative sequences: 334 (off-ratio)
- * Negative sequences: TODO
+ /* Model Table:
+ * Total considered sequences: 1389 / 2209
+ * - Positive sequences: first 849 (0.9950207709120384)
+ * - Probable sequences: next 223 (1072-849) (0.003984435961508326)
+ * - Neutral sequences: last 1137 (0.0009947931264532306)
+ * - Negative sequences: 820 (off-ratio)
+ * Negative sequences: TODO
*/
static const PRUint8 GreekLangModel[] =
{
- 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,
- 0,3,3,3,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,
- 2,3,3,3,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,3,3,1,3,3,1,3,3,3,3,3,3,2,3,0,
- 3,3,2,3,3,2,3,2,0,3,0,0,0,0,0,2,0,0,0,0,0,0,
- 3,3,3,2,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3,
- 3,3,3,1,3,3,3,3,3,3,0,0,0,0,0,1,0,0,0,0,0,3,
- 3,3,3,3,3,3,2,3,3,3,3,1,2,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,1,3,2,1,3,0,1,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3,
- 3,2,3,3,3,3,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,3,3,3,3,3,2,0,0,0,0,0,1,0,0,0,0,0,1,
- 3,3,3,3,2,3,3,3,3,3,0,3,3,3,3,3,3,3,2,3,3,
- 3,3,3,3,3,3,3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,1,3,3,
- 3,3,2,3,2,2,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,3,1,3,3,3,3,3,0,3,3,3,0,3,1,0,0,3,1,3,
- 0,0,3,1,3,2,0,3,3,0,1,0,0,0,0,0,0,0,0,0,0,3,
- 1,1,1,0,0,1,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,
- 0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,1,1,3,1,3,0,3,3,3,3,3,1,3,1,
- 3,3,1,3,1,1,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,2,3,3,3,1,3,1,3,2,3,3,3,1,3,3,3,1,3,0,
- 3,3,1,3,0,3,3,3,0,1,0,0,0,0,0,1,1,0,0,0,0,2,
- 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,
- 0,3,3,0,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,3,3,3,3,1,3,3,3,2,3,3,3,3,3,3,3,3,3,3,
- 3,3,2,3,3,3,3,3,1,1,0,0,0,0,0,1,0,0,0,0,0,2,
- 3,3,3,1,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,0,3,
- 3,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 2,3,3,2,3,3,3,3,3,2,3,3,3,1,3,0,0,0,3,0,3,
- 2,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2,
- 3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,0,0,0,3,1,3,
- 1,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2,
- 3,3,1,3,3,3,3,0,3,3,0,0,3,3,3,3,3,3,3,3,3,
- 3,3,3,3,0,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,3,3,3,3,3,3,1,3,3,3,1,3,0,0,0,3,0,3,
- 3,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,0,3,2,3,3,0,1,3,0,0,1,3,0,3,3,3,1,3,1,
- 3,3,0,3,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,2,3,2,3,1,3,3,3,1,3,3,3,1,3,2,1,3,3,1,3,
- 0,3,2,0,3,3,1,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,
- 1,1,3,2,3,2,3,3,3,1,3,3,3,0,3,0,0,0,3,1,3,
- 0,0,3,0,3,1,0,3,2,1,0,0,0,0,0,0,0,0,0,0,0,2,
- 3,3,3,3,3,3,3,1,0,3,1,0,3,3,3,3,3,3,0,3,0,
- 3,3,0,3,3,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,3,0,3,
- 2,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3,
- 3,3,0,3,3,3,3,0,1,3,1,0,3,3,3,3,3,3,0,3,0,
- 3,3,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,1,3,1,3,3,1,1,3,1,0,1,3,3,3,3,3,2,3,3,
- 3,3,0,2,0,2,2,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,
- 2,2,3,2,3,1,3,3,2,3,3,3,3,0,3,0,0,0,1,0,3,
- 2,0,2,0,3,3,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,1,
- 3,3,3,3,2,3,3,2,0,3,0,0,1,3,3,3,3,3,2,3,0,
- 3,3,0,3,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 3,3,1,3,0,3,0,0,1,3,0,2,1,3,0,3,2,3,1,3,0,
- 3,3,0,3,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
- 3,3,1,3,2,3,1,1,1,3,0,1,2,2,1,3,3,3,1,3,1,
- 3,3,1,2,0,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,0,
- 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,2,3,3,3,2,3,3,3,3,3,0,
- 0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,2,3,3,3,3,3,3,3,3,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,3,2,3,3,3,2,3,3,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,1,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,0,
- 0,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,
- 0,0,0,0,1,1,0,0,0,0,2,2,2,3,3,3,3,3,3,3,3,0,
- 0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,2,2,3,3,3,2,3,1,0,
- 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,2,1,1,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,1,1,3,2,2,3,2,3,0,
- 0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,1,3,3,2,3,3,2,1,0,
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
- 0,0,0,0,0,0,0,0,0,0,3,3,3,2,3,3,2,1,2,1,2,0,
- 3,3,1,2,0,3,0,0,0,3,0,0,0,3,0,3,2,2,0,1,0,
- 2,3,0,1,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,0,3,3,3,0,2,
+ 3,3,3,1,3,3,3,3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3,
+ 3,3,3,2,3,3,3,3,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,2,3,2,3,3,3,1,2,3,3,3,3,3,3,1,1,3,3,3,
+ 1,3,2,3,1,1,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,1,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3,
+ 3,1,3,3,3,3,3,3,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,3,
+ 3,3,3,3,3,3,3,2,3,3,2,2,3,3,1,3,3,3,3,3,3,3,3,
+ 2,3,3,3,1,1,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,1,
+ 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,3,3,3,3,
+ 3,3,3,3,3,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,3,3,3,3,3,
+ 3,3,3,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,3,2,3,3,3,0,3,3,3,3,3,3,1,3,3,3,3,
+ 3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,3,0,3,2,3,3,3,0,3,3,3,0,3,0,0,0,3,3,1,1,0,
+ 3,1,3,2,3,3,3,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,3,
+ 3,3,3,3,3,3,0,3,1,3,3,1,0,3,3,3,3,3,0,0,3,3,3,
+ 0,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,
+ 3,3,2,3,3,3,1,1,1,3,3,1,3,3,1,3,3,3,1,0,3,3,3,
+ 0,3,0,3,3,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,3,3,3,3,3,
+ 3,0,3,2,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,2,3,3,1,2,3,3,2,3,3,3,3,3,3,2,3,3,3,3,
+ 1,3,3,3,3,3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,
+ 3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,3,0,3,0,
+ 3,0,3,0,2,3,3,3,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2,
+ 1,3,3,2,3,2,3,3,3,2,3,3,3,0,3,0,0,0,3,3,0,2,0,
+ 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,3,0,1,0,
+ 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,0,3,3,3,3,3,0,3,0,0,3,3,3,3,3,3,3,2,3,3,3,
+ 3,3,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,1,3,2,3,0,3,0,0,1,3,1,3,3,3,0,0,3,3,3,
+ 0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,3,3,3,3,0,3,3,3,1,3,0,0,0,3,3,0,3,0,
+ 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,3,3,3,1,3,3,3,1,3,3,3,0,3,0,1,2,3,3,0,1,3,
+ 2,0,3,1,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,1,3,1,3,1,3,3,3,0,3,3,3,0,3,0,0,0,3,3,0,0,0,
+ 2,0,3,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,
+ 3,3,3,3,3,3,0,3,0,3,1,0,2,3,3,3,3,3,0,0,3,3,3,
+ 0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 2,3,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,3,3,0,2,0,
+ 3,0,3,0,3,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,
+ 3,3,0,3,3,3,0,3,0,3,0,0,3,3,2,2,3,3,0,0,3,3,3,
+ 0,3,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,3,1,3,3,3,2,3,3,3,0,3,0,0,0,2,3,0,3,0,
+ 2,0,3,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,3,3,1,3,0,3,1,3,0,0,1,3,3,3,3,3,1,0,3,3,3,
+ 0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,0,3,0,3,0,3,0,3,0,1,0,2,3,3,3,3,2,2,3,2,3,
+ 0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,2,3,0,3,0,0,0,3,0,0,0,3,1,3,2,3,0,0,3,3,3,
+ 0,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 3,3,1,3,1,3,0,0,0,3,0,0,1,3,0,3,3,2,0,0,3,3,3,
+ 0,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,2,3,3,3,2,3,3,3,3,3,2,3,3,3,3,0,
+ 0,1,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,
+ 0,0,1,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,0,
+ 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,0,
+ 0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,3,2,3,2,3,1,3,3,2,1,0,
+ 0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,2,3,3,1,3,2,3,3,1,3,2,3,0,
+ 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0,
+ 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,2,2,3,1,3,2,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,1,3,3,0,3,3,2,1,3,3,2,2,0,
+ 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,2,2,3,3,2,3,1,3,2,2,0,
+ 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,2,3,2,3,3,2,2,1,2,3,1,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,3,3,2,3,1,3,1,1,2,3,2,1,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,2,3,2,3,3,3,3,3,1,3,1,3,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,2,3,2,1,1,1,2,0,1,3,3,3,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,3,1,3,2,3,2,2,0,
+ 3,3,0,2,0,3,0,0,0,3,0,0,0,3,0,3,2,3,0,0,0,2,2,
+ 0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
};
@@ -226,8 +257,8 @@ const SequenceModel Iso_8859_7GreekModel =
{
Iso_8859_7_CharToOrderMap,
GreekLangModel,
- 43,
- (float)0.9990009349577684,
+ 47,
+ (float)0.9990052068735468,
PR_FALSE,
"ISO-8859-7",
"el"
@@ -237,22 +268,33 @@ const SequenceModel Windows_1253GreekModel =
{
Windows_1253_CharToOrderMap,
GreekLangModel,
- 43,
- (float)0.9990009349577684,
+ 47,
+ (float)0.9990052068735468,
PR_FALSE,
"WINDOWS-1253",
"el"
};
+const SequenceModel Cp737GreekModel =
+{
+ Cp737_CharToOrderMap,
+ GreekLangModel,
+ 47,
+ (float)0.9990052068735468,
+ PR_FALSE,
+ "CP737",
+ "el"
+};
+
const LanguageModel GreekModel =
{
"el",
Unicode_CharOrder,
- 86,
+ 94,
GreekLangModel,
- 43,
+ 47,
6,
- (float)0.440545731568196,
- 27,
- (float)0.03465402569480207,
+ (float)0.4332502995419538,
+ 31,
+ (float)0.03350734517136193,
};
diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp
index b0aa01a..9bf3ad3 100644
--- a/src/nsSBCSGroupProber.cpp
+++ b/src/nsSBCSGroupProber.cpp
@@ -69,6 +69,7 @@ nsSBCSGroupProber::nsSBCSGroupProber()
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel);
+ mProbers[n++] = new nsSingleByteCharSetProber(&Cp737GreekModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BulgarianModel);
mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BulgarianModel);
diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h
index fa54561..86dbae5 100644
--- a/src/nsSBCharSetProber-generated.h
+++ b/src/nsSBCharSetProber-generated.h
@@ -38,7 +38,7 @@
#ifndef nsSingleByteCharSetProber_generated_h__
#define nsSingleByteCharSetProber_generated_h__
-#define NUM_OF_SEQUENCE_MODELS 115
+#define NUM_OF_SEQUENCE_MODELS 116
extern const SequenceModel Iso_8859_6ArabicModel;
extern const SequenceModel Windows_1256ArabicModel;
@@ -64,6 +64,7 @@ extern const SequenceModel Windows_1252GermanModel;
extern const SequenceModel Iso_8859_7GreekModel;
extern const SequenceModel Windows_1253GreekModel;
+extern const SequenceModel Cp737GreekModel;
extern const SequenceModel Iso_8859_1EnglishModel;
extern const SequenceModel Windows_1252EnglishModel;
diff --git a/test/el/cp737.txt b/test/el/cp737.txt
new file mode 100644
index 0000000..4c8c14c
--- /dev/null
+++ b/test/el/cp737.txt
@@ -0,0 +1 @@
+ 櫘 夘 ⤦ ࡫ 磜 ⩩ 回 ⤦ Marmota, 餫 樜 . ᭦ 回 夘 婫 ᫦ .