diff options
-rw-r--r-- | README.md | 1 | ||||
-rw-r--r-- | script/BuildLangModelLogs/LangGreekModel.log | 436 | ||||
-rw-r--r-- | script/charsets/cp737.py | 79 | ||||
-rw-r--r-- | script/langs/el.py | 4 | ||||
-rw-r--r-- | src/LangModels/LangGreekModel.cpp | 344 | ||||
-rw-r--r-- | src/nsSBCSGroupProber.cpp | 1 | ||||
-rw-r--r-- | src/nsSBCharSetProber-generated.h | 3 | ||||
-rw-r--r-- | test/el/cp737.txt | 1 |
8 files changed, 489 insertions, 380 deletions
@@ -86,6 +86,7 @@ uchardet started as a C language binding of the original C++ implementation of t * UTF-8 * ISO-8859-7 * WINDOWS-1253 + * CP737 * Hebrew * UTF-8 * ISO-8859-8 diff --git a/script/BuildLangModelLogs/LangGreekModel.log b/script/BuildLangModelLogs/LangGreekModel.log index f81f77f..6a8a092 100644 --- a/script/BuildLangModelLogs/LangGreekModel.log +++ b/script/BuildLangModelLogs/LangGreekModel.log @@ -1,247 +1,231 @@ = Logs of language model for Greek (el) = - Generated by BuildLangModel.py -- Started: 2022-12-14 23:56:52.996274 +- Started: 2022-12-18 20:25:01.002309 - Maximum depth: 4 - Max number of pages: 200 == Parsed pages == -Πύλη:Κύρια (revision 9720674) -Θάνατος της Μάχσα Αμίνι (revision 9785479) -Ιράκ (revision 9784253) -5ος αιώνας π.Χ. (revision 9503435) -1960 (revision 9026602) -4ος αιώνας π.Χ. (revision 9500049) -Σάντα Μαρία ντελ Πόπολο (revision 9813223) -Ρίσι Σούνακ (revision 9807035) -Γαλλία (revision 9809487) -Γενικός Γραμματέας του Κομμουνιστικού Κόμματος της Κίνας (revision 9790632) -Ολυμπιακοί Αγώνες (revision 9767748) -Αμπέμπε Μπικίλα (revision 9629134) -Κεντρική Αμερική (revision 9436648) -Ζήνων ο Ελεάτης (revision 9687689) -25 Απριλίου (revision 9528548) -Ιαπωνία (revision 9738909) -Η Σταύρωση του Αγίου Πέτρου (revision 8027915) -Δόλιχος (revision 9642281) -Κατάρ (revision 9777831) -Καραϊβική (revision 9776894) -Πρωτεύουσα της Γαλλίας (revision 9694896) -16 Οκτωβρίου (revision 9719896) -Επισκοπή της Ρώμης (revision 8247187) -8ος αιώνας π.Χ. (revision 9509177) -1516 (revision 8891470) -Συριακή Ορθόδοξη Εκκλησία (revision 8814522) -Θεόφραστος (revision 9645199) -16 Σεπτεμβρίου (revision 9054145) -Ατλαντικός Ωκεανός (revision 9450595) -Ηνωμένο Βασίλειο (revision 9738607) -Καναδάς (revision 9736902) -Νότια Ασία (revision 9718470) -Άρθουρ Γουέλσλι, δούκας του Ουέλλινγκτον (revision 9810101) -Βιετνάμ (revision 9767839) -Μεροβίγγειοι (revision 9720122) -Ντομινίκ Γουίλκινς (revision 9798740) -BIBSYS (revision 9155553) -Γουατεμάλα (revision 9770327) -Αγγλική γλώσσα (revision 9779698) -Αρκτικός Ωκεανός (revision 9450607) -Αθλητισμός (revision 9816520) -Δυτική Ευρώπη (revision 9667409) -Αγγλία (revision 9730532) -Λεωτυχίδας ο Λακεδαιμόνιος (revision 9655599) -Καινή Διαθήκη (revision 9725581) -1660 (revision 7905687) -Ευρασία-Αφρική (revision 9667364) -Άμστερνταμ (revision 9701304) -Πολυνησία (revision 9667374) -Άρσης (revision 8381416) -Τασκένδη (revision 9674027) -Système universitaire de documentation (revision 9519040) -1964 (revision 9811809) -COVID-19 (revision 9751162) -Σεισμός (revision 9555986) -Μαρόκο (revision 9816451) -Ευρωπαϊκή Ένωση (revision 9807037) -Αρχαιοελληνική πυγμαχία (revision 9008193) -Άντονι Ήντεν (revision 9752041) -11 Αυγούστου (revision 8988727) -2006 (revision 9797947) +Πρωτεύοντα (revision 9792164) +Ευαρχοντομυωξοί (revision 9475530) +Φολιδωτά (θηλαστικά) (revision 8966182) +Ανθρώπινη εξέλιξη (revision 9731824) +Υδατάνθρακες (revision 9276169) +Άνθρωπος (revision 9804050) +National Library of the Czech Republic (revision 9499518) +Ταξινομία (revision 6174527) +Δεοξυριβόζη (revision 9735675) +Συστηματική ταξινόμηση (revision 9163863) +Οικογένεια (βιολογία) (revision 8380547) +Μονοσακχαρίτης (revision 8520367) +Ευλιπότυφλα (revision 8635098) +Γαλάγος (revision 9624211) +Ανθρωποειδή (revision 9802784) +Μυρμήγκι (revision 9743672) +Primates (revision 9792164) +Εθνική Βιβλιοθήκη της Μποτσουάνα (revision 9771961) +Εθνική Βιβλιοθήκη της Σλοβακίας (revision 9545464) +Κίνα (revision 9794230) +Μονοσακχαρίτες (revision 8520367) +Άνθρακας (revision 9698608) +Τερμίτης (revision 8570600) +Virtual International Authority File (revision 9547787) +Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547) +International Union for Conservation of Nature (revision 9555075) +Neogene (revision 7970278) +Ανθρωπoειδή (revision 9802784) +Λάρυγγας (revision 8037233) +Θηλαστικά (revision 9802762) +IUCN Red List (revision 9104016) +Δισακχαρίτης (revision 9301054) +Ινσουλίνη (revision 9193560) +Αρχαϊκοί Homo sapiens (revision 9496339) +Εθνική Βιβλιοθήκη της Μοζαμβίκης (revision 9771960) +Εθνική Βιβλιοθήκη της Πολωνίας (revision 9771967) +Ολιγοσακχαρίτης (revision 9784937) +Θεσμός (revision 9409922) +Μοριακό βάρος (revision 8588261) +Παράνθρωποι (revision 9187211) +Χρονολόγιο της ανθρώπινης εξέλιξης (revision 9494488) +Κοινός πρόγονος (revision 7955205) +Ασία (revision 9640488) +Εθνική Βιβλιοθήκη του Βανουάτου (revision 9510031) +Συνομοταξία (revision 8090691) +Διαδίκτυο (revision 9818610) +Τριγλυκερίδιο (revision 8991916) +Εθνική Βιβλιοθήκη της Λετονίας (revision 9736743) +Εθνική βιβλιοθήκη της Σουηδίας (revision 9741133) +Ζώα (revision 9797988) +Απειλούμενα είδη (revision 9387012) +Εθνική Βιβλιοθήκη της Μιανμάρ (revision 9771959) +Silurian (revision 7083264) +Γερμανική γλώσσα (revision 9768836) +Ζωολογία (revision 9597532) +Σπονδυλωτά (revision 8936763) +Χορδωτά (revision 9800855) +Εθνική Βιβλιοθήκη του Ελ Σαλβαδόρ (revision 9608126) +Μακρομόρια (revision 8962637) +Homo sapiens (revision 9804050) +Γλυκίδια (revision 8976376) Κατάλογος καθιερωμένων όρων (revision 9747802) -Ρωσία (revision 9756811) -Μπιτς βόλεϊ (revision 9629816) -Αραβική γλώσσα (revision 9758388) -7ος αιώνας π.Χ. (revision 9509175) +Paleogene (revision 7772183) +Γένος (βιολογία) (revision 8620951) +Βραδυποδόμορφα (revision 8793874) +Εθνική Βιβλιοθήκη της Ουκρανίας (revision 9818749) +Περίοδος (γεωλογία) (revision 9598229) +Γραμμομόριο (revision 9175982) +Νουκλεϊκά οξέα (revision 9020237) +Γάλα (revision 9473543) +Μετάλλαξη (revision 9662655) +Γαλακτόζη (revision 8983758) +Φάλαινα (revision 9455804) +Εθνική Βιβλιοθήκη της Ισπανίας (revision 9771953) +Γλυκογόνο (revision 8033277) +Ισπανική γλώσσα (revision 9751022) +Φωνητικές χορδές (revision 9179304) +Κανονικές συνθήκες (revision 9776846) +Άλπεις (revision 9759633) +Αντίδραση συμπύκνωσης (revision 8965637) +National Diet Library (revision 9533181) +Εθνική Βιβλιοθήκη της Βραζιλίας (revision 9516238) +Homo sapiens sapiens (revision 9804050) +Εθνική Βιβλιοθήκη των Μπαρμπάντος (revision 9608141) +Μουντάνεουμ (revision 9387431) +Ζώο (revision 9797988) +Εθνική και Πανεπιστημιακή Βιβλιοθήκη της Ισλανδίας (revision 9510045) +Μόριο (revision 9737689) +Εθνική Βιβλιοθήκη της Ανδόρας (revision 9771949) +Βασίλειο (βιολογία) (revision 9171746) +Εθνική και Πανεπιστημιακή Βιβλιοθήκη «Άγιος Κλήμης της Αχρίδας» (revision 9608210) +Κλίμα (revision 9262599) +Δακτυλιοσκώληκες (revision 8985128) +Ασπάλακας (revision 9429446) +Μόλυνση (revision 8512424) International Standard Name Identifier (revision 6861942) -Ελευθερία, Ισότητα, Αδελφοσύνη (revision 8591437) -Συμβούλιο των Αντιπροσώπων του Ιράκ (revision 9812554) -Βιοεπιστήμες (revision 9175912) -21 Οκτωβρίου (revision 9123772) -Καμήλα (revision 9815157) -1526 (revision 7905955) -Μικτή οικονομία (revision 9756694) -Εμπραχίμ Ραΐσι (revision 9662244) -Βαθυσκάφος (revision 8755412) -Γιοχάνεσμπουργκ (revision 9234192) -Γιαζίντι (revision 9251594) -Πακιστάν (revision 9719399) -27 Νοεμβρίου (revision 9168386) -25 Οκτωβρίου (revision 9732587) -Ειρηνικός Ωκεανός (revision 9659005) -Βία κατά των γυναικών (revision 9404071) -Γιανίκ Νοά (revision 9721039) -Μεξικό (revision 9672139) -Ευρώπη (revision 9806540) -Ακκάδιοι (revision 8546428) -Είλωτες (revision 9472621) -Λεύκιππος (φιλόσοφος) (revision 8933581) -Δόλοπες (revision 9091900) -Ήλιος (revision 9797813) -Αντίς Αμπέμπα (revision 9703571) -4 Απριλίου (revision 9797052) -Μεγασθένης (revision 9397713) -Ισλαμική Συμβουλευτική Συνέλευση (revision 6895099) -Αμχαρική γλώσσα (revision 8252762) -Προφήτης Ιωνάς (revision 8981060) -Μεσοποταμία (revision 9478563) -Deutsche Welle (revision 9697594) -Θεοδόσιος Α' (revision 9717330) -Τανζανία (revision 9672789) -Μπαρόκ (revision 9498929) -Διαδηλώσεις στο Ιράν για το θάνατο της Μάχσα Αμίνι (revision 9746434) -Ρώμη (revision 9684590) -Εκκλησία (αρχιτεκτονική) (revision 9466423) -Integrated Authority File (revision 8518544) -Μπραζίλια (revision 9696231) -2η χιλιετία (revision 9650679) -Ιράν (revision 9804479) -Ολυμπιονίκης (revision 9767748) -The Guardian (revision 9533576) -Ανεξαρτησία (revision 9730220) -Σεισμός στο Αγκαντίρ (1960) (revision 9042146) -Γκέμπχαρντ Λέμπερεχτ φον Μπλύχερ (revision 9711616) -Αντρές Μανουέλ Λόπες Ομπραδόρ (revision 9591461) -Ηνωμένα Έθνη (revision 9596090) -Καρλομάγνος (revision 9776930) -Διάδης ο Πελλαίος (revision 8453992) -Ισλάμ (revision 9535017) -International Standard Serial Number (revision 9426410) -Πολ Βερλέν (revision 9620098) -Διεθνής πρότυπος αριθμός βιβλίου (revision 9525547) -Παντζάμποι (revision 9410265) -Θερινοί Ολυμπιακοί Αγώνες 2004 (revision 9646436) -Κεντρική Αφρική (revision 9666820) -1695 (revision 7905645) -1841 (revision 9476734) -Καράτζ (revision 7678423) -Αρχιτεκτονική (revision 9699724) -Κάλι Γιούγκα (revision 9173397) -Νέπιντο (revision 9786205) -Ησαΐας (revision 9285628) -Μιγκέλ Ιδάλγο ι Κοστίγια (revision 6788184) -Λατινικά (revision 9751029) -Ριγιάλ του Κατάρ (revision 9194726) -Καπιταλισμός (revision 9353276) -Σύστημα ταξινόμησης βιβλιοθήκης (revision 9648804) -Εκαταίος ο Μιλήσιος (revision 9332195) -Χαντίθ (revision 9423173) -24 Οκτωβρίου (revision 9123843) -Ατρείδες (revision 9269517) -Νερό (revision 9799579) -Χριστιανισμός στην Συρία (revision 9504011) +Υδροξυλομάδα (revision 9719647) +Εθνική Βιβλιοθήκη του Κουβέιτ (revision 9511761) +Homo rhodesiensis (revision 7605622) +Αγγλική γλώσσα (revision 9779698) +Περιβαλλοντική εκπαίδευση (revision 7971138) +Γουανίνη (revision 8392293) +Γριβάδι (revision 9370003) +Διεθνής Επιτροπή Στρωματογραφίας (revision 9796210) +Εχινόδερμα (revision 9101031) +Εθνική Βιβλιοθήκη των Φιλιππινών (revision 9511751) +Αρτίγονος (revision 9753577) +Εθνική Βιβλιοθήκη της Σρι Λάνκα (revision 9511705) +Περιβαλλοντικά προβλήματα (revision 9555971) +Υπερτάξη (revision 7554395) +Κατάλογος αντιστοιχίας Λατινικών-Ελληνικών όρων ταξινομικών μονάδων (revision 9562399) +Κόκκινος κατάλογος της IUCN (revision 9104016) +Κοινοβουλευτική Βιβλιοθήκη της Γεωργίας (revision 9508234) +Ασπόνδυλα (revision 9049085) +Τάξη (βιολογία) (revision 7554395) +Γρυλοβλαττοειδή (revision 6401187) +Γλυκόζη (revision 9770284) +Τουρκικές γλώσσες (revision 9284882) +Εκπνοή (revision 9611418) +Ανθρωπίνοι (revision 9103976) +Εθνική Βιβλιοθήκη του Μαυρικίου (revision 9736776) +Σαρκοφάγα (revision 8222140) +Χημική ένωση (revision 9478321) +Νουκλεοτίδια (revision 8520133) +Πλειστόκαινο (revision 9225169) +Υποοικογένεια (revision 8380547) +Πόδι (έντομα) (revision 7865328) +Δημόσια Βιβλιοθήκη Τσαρλς Α. Χάλμπερτ (revision 9607718) +Δισακχαρίτες (revision 9301054) +Νορβηγική γλώσσα (revision 9527903) +Σορβόζη (revision 9702780) +Bibliothèque nationale de France (revision 9636186) +1778 (revision 9509259) +Αμυλοπηκτίνη (revision 7348804) +Υφομοταξία (revision 9796614) +Κοινή καταγωγή (revision 7955205) +Βιβλιοθήκη του Βατικανού (revision 9791596) +Κράμα (revision 8491814) +Orrorin tugenensis (revision 8021796) +Εθνική Βιβλιοθήκη της Γερμανίας (revision 9533197) Εθνική Βιβλιοθήκη της Ελλάδος (revision 9771951) -Οθωμανικός Στρατός (revision 8724392) -Κάτεγατ (revision 9501508) -Βερμούδες (revision 9767755) -Floruit (revision 9328997) -Τρανσυλβανία (revision 9743855) -Κρυπτεία (revision 9689259) -Δημοκρατία της Ιρλανδίας (revision 9672626) -Κούρδοι (revision 9814315) -Οθέλλος (revision 9168875) -Κώμα (revision 7363577) -Συρία (revision 9703382) -Ευρασία (revision 9667362) -Θάλασσα του Σολομώντα (revision 9466850) -Πόλη του Μεξικού (revision 9692487) -Σημιτικές γλώσσες (revision 9595481) -Ισραήλ (revision 9777728) -Ναβουχοδονόσορ Β' (revision 9785243) -Αρδέννες (revision 9438776) -Λοτζ (revision 9812741) -Παρθένοι Νήσοι (revision 9466868) -Κεντρική Ευρώπη (revision 9712626) -Λάπις λάζουλι (revision 9356278) -Πόλεμος του Έβδομου Συνασπισμού (revision 9526274) -Βέλγιο (revision 9785761) -Γκιμαράες (revision 9256928) -Σπονδυλική στήλη (revision 9772196) -Κομμούνα του Παρισιού (Γαλλική επανάσταση) (revision 8737961) -Κοινοπολιτεία των Εθνών (revision 9713895) -Τηλεμετρία (revision 9300418) -Β΄ Παγκόσμιος Πόλεμος (revision 9796929) -Σουηδία (revision 9724663) +Κάρολος Λινναίος (revision 9170651) +Εθνική Βιβλιοθήκη της Μαυριτανίας (revision 9771958) +Εθνική Βιβλιοθήκη της Σαουδικής Αραβίας (revision 9777111) +Εθνική Βιβλιοθήκη της Ιορδανίας (revision 9510012) +Κλαδιστική (revision 7593647) +Κετόζες (revision 9015709) +Υδροξύλιο (revision 9719647) +Απειλούμενο είδος (revision 9387012) +Νέφος (revision 9753949) +Κατάρρινοι (revision 9802799) +Επικοινωνία (revision 9810024) +Χημικός τύπος (revision 9478340) +Εθνικά Αρχεία και Βιβλιοθήκη της Αιθιοπίας (revision 9608078) +Ολιγόκαινος εποχή (revision 8882927) == End of Parsed pages == -- Wikipedia parsing ended at: 2022-12-15 00:00:15.230612 +- Wikipedia parsing ended at: 2022-12-18 20:30:49.244663 -63 characters appeared 1687423 times. +62 characters appeared 918903 times. Most Frequent characters: -[ 0] Char α: 9.291090615690315 % -[ 1] Char ο: 8.043092929277366 % -[ 2] Char τ: 7.9854310389274055 % -[ 3] Char ι: 6.7272995567797755 % -[ 4] Char ν: 6.033816061532882 % -[ 5] Char ε: 5.973842954611855 % -[ 6] Char ρ: 4.455077357603873 % -[ 7] Char σ: 4.30638909153188 % -[ 8] Char κ: 4.299455441818679 % -[ 9] Char η: 3.817951989513003 % -[10] Char ς: 3.5992160827486646 % -[11] Char π: 3.4671804283810284 % -[12] Char μ: 3.293483613770821 % -[13] Char υ: 3.188708462549106 % -[14] Char λ: 2.825551151074745 % -[15] Char ί: 2.374330562046387 % -[16] Char ό: 1.996061449915048 % -[17] Char ά: 1.9162948472315477 % -[18] Char γ: 1.7252935393200164 % -[19] Char έ: 1.6340893777079015 % -[20] Char δ: 1.4231167881438145 % -[21] Char ω: 1.3993527408361743 % -[22] Char ή: 1.3272309314262043 % -[23] Char χ: 1.1665717487553506 % -[24] Char ύ: 1.0015271807958053 % -[25] Char θ: 0.9180863363839417 % -[26] Char β: 0.8104666109209131 % -[27] Char ώ: 0.779828175863432 % -[28] Char φ: 0.699231905692882 % -[29] Char ξ: 0.37246143972199025 % -[30] Char ζ: 0.29607276895004986 % -[31] Char e: 0.2829166130839748 % -[32] Char a: 0.25891551792289186 % -[33] Char i: 0.21506166503597496 % -[34] Char n: 0.19153466558177767 % -[35] Char r: 0.19141614165505627 % -[36] Char o: 0.18294168089447638 % -[37] Char s: 0.17677843670496374 % -[38] Char t: 0.15899984769675415 % -[39] Char l: 0.12883550834615862 % -[40] Char c: 0.10969389418065299 % -[41] Char d: 0.10281950643081196 % -[42] Char ψ: 0.09772297758179188 % +[ 0] Char α: 9.042630179681641 % +[ 1] Char ο: 7.761537398397872 % +[ 2] Char τ: 7.389680956531865 % +[ 3] Char ι: 7.071584269503963 % +[ 4] Char ν: 6.1224090029089036 % +[ 5] Char ε: 5.937188147171138 % +[ 6] Char κ: 4.257359046602308 % +[ 7] Char ρ: 4.217311294010358 % +[ 8] Char σ: 4.050373107934135 % +[ 9] Char η: 3.6424954538183028 % +[10] Char π: 3.53171118170253 % +[11] Char ς: 3.3343018795237365 % +[12] Char μ: 3.2733596473185957 % +[13] Char υ: 3.02023173283796 % +[14] Char λ: 2.6589313561932 % +[15] Char ί: 2.381426548830508 % +[16] Char ό: 1.9545044471505697 % +[17] Char ά: 1.8594998601593422 % +[18] Char γ: 1.7558980654106038 % +[19] Char δ: 1.6237840120230318 % +[20] Char έ: 1.569806606355622 % +[21] Char ω: 1.5474973963519545 % +[22] Char ή: 1.323969994656672 % +[23] Char χ: 1.1194870405254962 % +[24] Char ύ: 1.0730185884690766 % +[25] Char θ: 1.0217618181679675 % +[26] Char ώ: 0.7902901612030867 % +[27] Char φ: 0.7704839357364162 % +[28] Char β: 0.7675456495408112 % +[29] Char ξ: 0.4437900409510035 % +[30] Char ζ: 0.4305133403634551 % +[31] Char a: 0.4036334629444022 % +[32] Char e: 0.39601568391875963 % +[33] Char i: 0.3618445037180203 % +[34] Char n: 0.3161378295641651 % +[35] Char o: 0.31026125717295516 % +[36] Char s: 0.2842519830711185 % +[37] Char r: 0.2519308349194638 % +[38] Char t: 0.23560702272165832 % +[39] Char l: 0.20110936627696285 % +[40] Char c: 0.19925933422787825 % +[41] Char h: 0.1609527882703615 % +[42] Char d: 0.14419367441394795 % +[43] Char u: 0.13690237163226152 % +[44] Char m: 0.1365758953883054 % +[45] Char p: 0.11328725665276966 % +[46] Char ψ: 0.10240471518756604 % -The first 43 characters have an accumulated ratio of 0.9924523963463813. -The first 6 characters have an accumulated ratio of 0.440545731568196. -All characters whose order is over 27 have an accumulated ratio of 0.03465402569480207. +The first 47 characters have an accumulated ratio of 0.9949875014011275. +The first 6 characters have an accumulated ratio of 0.4332502995419538. +All characters whose order is over 31 have an accumulated ratio of 0.03350734517136193. -1515 sequences found. +1389 sequences found. -First 852 (typical positive ratio): 0.9950198012242328 -Next 229 (1081-852): 0.003981133733535591 -Rest: 0.0009990650422315728 +First 849 (typical positive ratio): 0.9950207709120384 +Next 223 (1072-849): 0.003984435961508326 +Rest: 0.0009947931264532306 -- Processing end: 2022-12-15 00:00:15.353968 +- Processing end: 2022-12-18 20:30:49.348223 diff --git a/script/charsets/cp737.py b/script/charsets/cp737.py new file mode 100644 index 0000000..be9c4bc --- /dev/null +++ b/script/charsets/cp737.py @@ -0,0 +1,79 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# ##### BEGIN LICENSE BLOCK ##### +# Version: MPL 1.1/GPL 2.0/LGPL 2.1 +# +# The contents of this file are subject to the Mozilla Public License Version +# 1.1 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# http://www.mozilla.org/MPL/ +# +# Software distributed under the License is distributed on an "AS IS" basis, +# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License +# for the specific language governing rights and limitations under the +# License. +# +# The Original Code is Mozilla Universal charset detector code. +# +# The Initial Developer of the Original Code is +# Netscape Communications Corporation. +# Portions created by the Initial Developer are Copyright (C) 2001 +# the Initial Developer. All Rights Reserved. +# +# Contributor(s): +# Jehan <jehan@girinstud.io> +# +# Alternatively, the contents of this file may be used under the terms of +# either the GNU General Public License Version 2 or later (the "GPL"), or +# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), +# in which case the provisions of the GPL or the LGPL are applicable instead +# of those above. If you wish to allow use of your version of this file only +# under the terms of either the GPL or the LGPL, and not to allow others to +# use your version of this file under the terms of the MPL, indicate your +# decision by deleting the provisions above and replace them with the notice +# and other provisions required by the GPL or the LGPL. If you do not delete +# the provisions above, a recipient may use your version of this file under +# the terms of any one of the MPL, the GPL or the LGPL. +# +# ##### END LICENSE BLOCK ##### + +from codepoints import * + +# NOTE: I use CP737 and not IBM737 as the main encoding name, since iconv +# conversion failed with IBM737 with the file from #21 and in BuildLangModel.py +# script, even though these are supposed to be synonyms. +name = 'CP737' +aliases = ['IBM737', 'OEM 737', 'MS-DOS Greek'] + +language = \ +{ + # Wikipedia tells us: Code page 737 (CCSID 737) (also known as CP 737, + # IBM 00737, and OEM 737, MS-DOS Greek) is a code page used under DOS to + # write the Greek language.[4] It was much more popular than code page + # 869 although it lacks the letters ΐ and ΰ. + 'complete': [ 'el' ], + 'incomplete': [] +} + +# X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF # +charmap = \ +[ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, # 0X + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, # 1X + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # 2X + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, # 3X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 4X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,SYM, # 5X + SYM,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 6X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,SYM,SYM,SYM,SYM,CTR, # 7X + + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 8X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # 9X + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # AX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # BX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # CX + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # DX + LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET,LET, # EX + LET,SYM,SYM,SYM,LET,LET,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, # FX +] diff --git a/script/langs/el.py b/script/langs/el.py index 4c8352b..8bc6406 100644 --- a/script/langs/el.py +++ b/script/langs/el.py @@ -45,11 +45,11 @@ import re name = 'Greek' code = 'el' use_ascii = False -charsets = ['ISO-8859-7', 'WINDOWS-1253'] +charsets = ['ISO-8859-7', 'WINDOWS-1253', 'CP737'] ## Optional Properties ## alphabet = 'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ' -start_pages = ['Πύλη:Κύρια'] +start_pages = ['Πρωτεύοντα'] wikipedia_code = code case_mapping = True diff --git a/src/LangModels/LangGreekModel.cpp b/src/LangModels/LangGreekModel.cpp index 4825977..14ada24 100644 --- a/src/LangModels/LangGreekModel.cpp +++ b/src/LangModels/LangGreekModel.cpp @@ -38,50 +38,51 @@ #include "../nsSBCharSetProber.h" #include "../nsSBCharSetProber-generated.h" #include "../nsLanguageDetector.h" + #include "../nsLanguageDetector-generated.h" /********* Language model for: Greek *********/ /** * Generated by BuildLangModel.py - * On: 2022-12-15 00:00:15.231612 + * On: 2022-12-18 20:30:49.245016 **/ -/* Character Mapping Table: - * ILL: illegal character. - * CTR: control character specific to the charset. - * RET: carriage/return. - * SYM: symbol (punctuation) that does not belong to word. - * NUM: 0 - 9. - * - * Other characters are ordered by probabilities - * (0 is the most common character in the language). - * - * Orders are generic to a language. So the codepoint with order X in - * CHARSET1 maps to the same character as the codepoint with the same - * order X in CHARSET2 for the same language. - * As such, it is possible to get missing order. For instance the - * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 - * even though they are both used for French. Same for the euro sign. - */ -static const unsigned char Iso_8859_7_CharToOrderMap[] = + /* Character Mapping Table: + * ILL: illegal character. + * CTR: control character specific to the charset. + * RET: carriage/return. + * SYM: symbol (punctuation) that does not belong to word. + * NUM: 0 - 9. + * + * Other characters are ordered by probabilities + * (0 is the most common character in the language). + * + * Orders are generic to a language. So the codepoint with order X in + * CHARSET1 maps to the same character as the codepoint with the same + * order X in CHARSET2 for the same language. + * As such, it is possible to get missing order. For instance the + * ligature of 'o' and 'e' exists in ISO-8859-15 but not in ISO-8859-1 + * even though they are both used for French. Same for the euro sign. + */ + static const unsigned char Iso_8859_7_CharToOrderMap[] = { CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 4X */ - 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 6X */ - 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */ + 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */ + 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 8X */ CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 9X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 19, 22, 15,SYM, 16,SYM, 24, 27, /* BX */ - 56, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* CX */ - 11, 6,ILL, 7, 2, 13, 28, 23, 42, 21, 46, 60, 17, 19, 22, 15, /* DX */ - 61, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* EX */ - 11, 6, 10, 7, 2, 13, 28, 23, 42, 21, 46, 60, 16, 24, 27,ILL, /* FX */ + SYM,SYM,SYM,SYM,SYM,SYM, 17,SYM, 20, 22, 15,SYM, 16,SYM, 24, 26, /* BX */ + 58, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* CX */ + 10, 7,ILL, 8, 2, 13, 27, 23, 46, 21, 49, 60, 17, 20, 22, 15, /* DX */ + 61, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* EX */ + 10, 7, 11, 8, 2, 13, 27, 23, 46, 21, 49, 60, 16, 24, 26,ILL, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ @@ -91,134 +92,164 @@ static const unsigned char Windows_1253_CharToOrderMap[] = CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ - SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 4X */ - 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,SYM, /* 5X */ - SYM, 32, 48, 40, 41, 31, 50, 49, 44, 33, 55, 54, 39, 43, 34, 36, /* 6X */ - 47, 59, 35, 37, 38, 45, 52, 53, 58, 51, 57,SYM,SYM,SYM,SYM,CTR, /* 7X */ + SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */ + 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */ + 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */ SYM,ILL,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 8X */ ILL,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,ILL,SYM,ILL,ILL,ILL,ILL, /* 9X */ SYM,SYM, 17,SYM,SYM,SYM,SYM,SYM,SYM,SYM,ILL,SYM,SYM,SYM,SYM,SYM, /* AX */ - SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 19, 22, 15,SYM, 16,SYM, 24, 27, /* BX */ - 56, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* CX */ - 11, 6,ILL, 7, 2, 13, 28, 23, 42, 21, 46, 60, 17, 19, 22, 15, /* DX */ - 61, 0, 26, 18, 20, 5, 30, 9, 25, 3, 8, 14, 12, 4, 29, 1, /* EX */ - 11, 6, 10, 7, 2, 13, 28, 23, 42, 21, 46, 60, 16, 24, 27,ILL, /* FX */ + SYM,SYM,SYM,SYM,SYM, 62,SYM,SYM, 20, 22, 15,SYM, 16,SYM, 24, 26, /* BX */ + 58, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* CX */ + 10, 7,ILL, 8, 2, 13, 27, 23, 46, 21, 49, 60, 17, 20, 22, 15, /* DX */ + 61, 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, /* EX */ + 10, 7, 11, 8, 2, 13, 27, 23, 46, 21, 49, 60, 16, 24, 26,ILL, /* FX */ +}; +/*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ + +static const unsigned char Cp737_CharToOrderMap[] = +{ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,RET,CTR,CTR,RET,CTR,CTR, /* 0X */ + CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR,CTR, /* 1X */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* 2X */ + NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,NUM,SYM,SYM,SYM,SYM,SYM,SYM, /* 3X */ + SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 4X */ + 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,SYM, /* 5X */ + SYM, 31, 48, 40, 42, 32, 50, 47, 41, 33, 55, 52, 39, 44, 34, 35, /* 6X */ + 45, 59, 37, 36, 38, 43, 53, 54, 57, 51, 56,SYM,SYM,SYM,SYM,CTR, /* 7X */ + 0, 28, 18, 19, 5, 30, 9, 25, 3, 6, 14, 12, 4, 29, 1, 10, /* 8X */ + 7, 8, 2, 13, 27, 23, 46, 21, 0, 28, 18, 19, 5, 30, 9, 25, /* 9X */ + 3, 6, 14, 12, 4, 29, 1, 10, 7, 8, 11, 2, 13, 27, 23, 46, /* AX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* BX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* CX */ + SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* DX */ + 21, 17, 20, 22, 49, 15, 16, 24, 60, 26, 17, 20, 22, 15, 16, 24, /* EX */ + 26,SYM,SYM,SYM, 49, 60,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM,SYM, /* FX */ }; /*X0 X1 X2 X3 X4 X5 X6 X7 X8 X9 XA XB XC XD XE XF */ -static const int Unicode_Char_size = 86; +static const int Unicode_Char_size = 94; static const unsigned int Unicode_CharOrder[] = { - 65, 32, 67, 40, 68, 41, 69, 31, 73, 33, 76, 39, 78, 34, 79, 36, - 82, 35, 83, 37, 84, 38, 97, 32, 99, 40, 100, 41, 101, 31,105, 33, - 108, 39, 110, 34, 111, 36, 114, 35, 115, 37, 116, 38, 902, 17,904, 19, - 905, 22, 906, 15, 908, 16, 910, 24, 911, 27, 913, 0, 914, 26,915, 18, - 916, 20, 917, 5, 918, 30, 919, 9, 920, 25, 921, 3, 922, 8,923, 14, - 924, 12, 925, 4, 926, 29, 927, 1, 928, 11, 929, 6, 931, 7,931, 10, - 932, 2, 933, 13, 934, 28, 935, 23, 936, 42, 937, 21, 940, 17,941, 19, - 942, 22, 943, 15, 945, 0, 946, 26, 947, 18, 948, 20, 949, 5,950, 30, - 951, 9, 952, 25, 953, 3, 954, 8, 955, 14, 956, 12, 957, 4,958, 29, - 959, 1, 960, 11, 961, 6, 962, 10, 963, 7, 964, 2, 965, 13,966, 28, - 967, 23, 968, 42, 969, 21, 972, 16, 973, 24, 974, 27, + 65, 31, 67, 40, 68, 42, 69, 32, 72, 41, 73, 33, 76, 39, 77, 44, + 78, 34, 79, 35, 80, 45, 82, 37, 83, 36, 84, 38, 85, 43, 97, 31, + 99, 40, 100, 42, 101, 32, 104, 41, 105, 33, 108, 39, 109, 44,110, 34, + 111, 35, 112, 45, 114, 37, 115, 36, 116, 38, 117, 43, 902, 17,904, 20, + 905, 22, 906, 15, 908, 16, 910, 24, 911, 26, 913, 0, 914, 28,915, 18, + 916, 19, 917, 5, 918, 30, 919, 9, 920, 25, 921, 3, 922, 6,923, 14, + 924, 12, 925, 4, 926, 29, 927, 1, 928, 10, 929, 7, 931, 8,931, 11, + 932, 2, 933, 13, 934, 27, 935, 23, 936, 46, 937, 21, 940, 17,941, 20, + 942, 22, 943, 15, 945, 0, 946, 28, 947, 18, 948, 19, 949, 5,950, 30, + 951, 9, 952, 25, 953, 3, 954, 6, 955, 14, 956, 12, 957, 4,958, 29, + 959, 1, 960, 10, 961, 7, 962, 11, 963, 8, 964, 2, 965, 13,966, 27, + 967, 23, 968, 46, 969, 21, 972, 16, 973, 24, 974, 26, }; -/* Model Table: - * Total considered sequences: 1515 / 1849 - * - Positive sequences: first 852 (0.9950198012242328) - * - Probable sequences: next 229 (1081-852) (0.003981133733535591) - * - Neutral sequences: last 768 (0.0009990650422315728) - * - Negative sequences: 334 (off-ratio) - * Negative sequences: TODO + /* Model Table: + * Total considered sequences: 1389 / 2209 + * - Positive sequences: first 849 (0.9950207709120384) + * - Probable sequences: next 223 (1072-849) (0.003984435961508326) + * - Neutral sequences: last 1137 (0.0009947931264532306) + * - Negative sequences: 820 (off-ratio) + * Negative sequences: TODO */ static const PRUint8 GreekLangModel[] = { - 2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3, - 0,3,3,3,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,3,3,3,3, - 2,3,3,3,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3, - 3,3,3,3,3,3,3,3,1,3,3,1,3,3,3,3,3,3,2,3,0, - 3,3,2,3,3,2,3,2,0,3,0,0,0,0,0,2,0,0,0,0,0,0, - 3,3,3,2,3,3,3,3,3,3,3,3,3,1,3,2,3,3,3,3,3, - 3,3,3,1,3,3,3,3,3,3,0,0,0,0,0,1,0,0,0,0,0,3, - 3,3,3,3,3,3,2,3,3,3,3,1,2,3,3,3,3,3,3,3,3, - 3,3,3,3,3,1,3,2,1,3,0,1,0,0,0,0,0,0,0,0,0,0, - 3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,3,3,2,3, - 3,2,3,3,3,3,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,3, - 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 3,3,3,3,3,3,3,3,3,2,0,0,0,0,0,1,0,0,0,0,0,1, - 3,3,3,3,2,3,3,3,3,3,0,3,3,3,3,3,3,3,2,3,3, - 3,3,3,3,3,3,3,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3,1,3,3, - 3,3,2,3,2,2,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,3,1,3,3,3,3,3,0,3,3,3,0,3,1,0,0,3,1,3, - 0,0,3,1,3,2,0,3,3,0,1,0,0,0,0,0,0,0,0,0,0,3, - 1,1,1,0,0,1,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0, - 0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,3,3,3,3,3,1,1,3,1,3,0,3,3,3,3,3,1,3,1, - 3,3,1,3,1,1,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,2,3,3,3,1,3,1,3,2,3,3,3,1,3,3,3,1,3,0, - 3,3,1,3,0,3,3,3,0,1,0,0,0,0,0,1,1,0,0,0,0,2, - 3,3,3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3, - 0,3,3,0,3,3,2,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3, - 3,3,3,3,3,3,1,3,3,3,2,3,3,3,3,3,3,3,3,3,3, - 3,3,2,3,3,3,3,3,1,1,0,0,0,0,0,1,0,0,0,0,0,2, - 3,3,3,1,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,0,3, - 3,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3, - 2,3,3,2,3,3,3,3,3,2,3,3,3,1,3,0,0,0,3,0,3, - 2,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2, - 3,3,3,3,3,3,3,3,3,2,3,3,3,1,3,0,0,0,3,1,3, - 1,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,2, - 3,3,1,3,3,3,3,0,3,3,0,0,3,3,3,3,3,3,3,3,3, - 3,3,3,3,0,1,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,3,3,3,3,3,3,3,1,3,3,3,1,3,0,0,0,3,0,3, - 3,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3, - 3,3,0,3,2,3,3,0,1,3,0,0,1,3,0,3,3,3,1,3,1, - 3,3,0,3,0,0,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,2,3,2,3,1,3,3,3,1,3,3,3,1,3,2,1,3,3,1,3, - 0,3,2,0,3,3,1,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,3,2,3,2,3,3,3,1,3,3,3,0,3,0,0,0,3,1,3, - 0,0,3,0,3,1,0,3,2,1,0,0,0,0,0,0,0,0,0,0,0,2, - 3,3,3,3,3,3,3,1,0,3,1,0,3,3,3,3,3,3,0,3,0, - 3,3,0,3,3,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,3,0,3, - 2,0,3,0,3,3,0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,3, - 3,3,0,3,3,3,3,0,1,3,1,0,3,3,3,3,3,3,0,3,0, - 3,3,0,3,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,1,3,1,3,3,1,1,3,1,0,1,3,3,3,3,3,2,3,3, - 3,3,0,2,0,2,2,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0, - 2,2,3,2,3,1,3,3,2,3,3,3,3,0,3,0,0,0,1,0,3, - 2,0,2,0,3,3,0,1,2,2,0,0,0,0,0,0,0,0,0,0,0,1, - 3,3,3,3,2,3,3,2,0,3,0,0,1,3,3,3,3,3,2,3,0, - 3,3,0,3,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0, - 3,3,1,3,0,3,0,0,1,3,0,2,1,3,0,3,2,3,1,3,0, - 3,3,0,3,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0, - 3,3,1,3,2,3,1,1,1,3,0,1,2,2,1,3,3,3,1,3,1, - 3,3,1,2,0,1,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0, - 0,1,1,0,0,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,0, - 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,2,3,3,3,2,3,3,3,3,3,0, - 0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,3,2,3,3,3,3,3,3,3,3,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,3,2,3,3,3,2,3,3,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,1,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,0, - 0,1,1,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0, - 0,0,0,0,1,1,0,0,0,0,2,2,2,3,3,3,3,3,3,3,3,0, - 0,1,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,2,2,3,3,3,2,3,1,0, - 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,2,1,1,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,1,1,3,2,2,3,2,3,0, - 0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,1,3,3,2,3,3,2,1,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,3,3,3,2,3,3,2,1,2,1,2,0, - 3,3,1,2,0,3,0,0,0,3,0,0,0,3,0,3,2,2,0,1,0, - 2,3,0,1,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,0,3,3,3,0,2, + 3,3,3,1,3,3,3,3,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,2,2,3,3,3,2,3, + 3,3,3,2,3,3,3,3,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,3, + 3,3,3,3,2,3,2,3,3,3,1,2,3,3,3,3,3,3,1,1,3,3,3, + 1,3,2,3,1,1,0,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,1,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,3,3,3, + 3,1,3,3,3,3,3,3,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,3, + 3,3,3,3,3,3,3,2,3,3,2,2,3,3,1,3,3,3,3,3,3,3,3, + 2,3,3,3,1,1,1,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,2,3,3,3,2,3,3,3,3,3,3,3,3,3,3,2,3,1, + 3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,3,3,2,3,3,3,3, + 3,3,3,3,3,2,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,3,3,1,3,3,2,3,3,3,3,3,3,3,3, + 3,3,3,3,3,3,3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,1,3,3,2,3,3,3,0,3,3,3,3,3,3,1,3,3,3,3, + 3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,3,0,3,2,3,3,3,0,3,3,3,0,3,0,0,0,3,3,1,1,0, + 3,1,3,2,3,3,3,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,3, + 3,3,3,3,3,3,0,3,1,3,3,1,0,3,3,3,3,3,0,0,3,3,3, + 0,3,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,1,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0, + 3,3,2,3,3,3,1,1,1,3,3,1,3,3,1,3,3,3,1,0,3,3,3, + 0,3,0,3,3,3,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,3,3,3,3,3,3,3,0,3,2,3,3,3,3,3,3,3, + 3,0,3,2,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,2,3,3,1,2,3,3,2,3,3,3,3,3,3,2,3,3,3,3, + 1,3,3,3,3,3,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1, + 3,3,3,0,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,3,0,3,0, + 3,0,3,0,2,3,3,3,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,2, + 1,3,3,2,3,2,3,3,3,2,3,3,3,0,3,0,0,0,3,3,0,2,0, + 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 2,3,3,3,3,3,3,3,3,3,3,3,3,0,3,0,0,0,3,3,0,1,0, + 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,0,3,3,3,3,3,0,3,0,0,3,3,3,3,3,3,3,2,3,3,3, + 3,3,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,3,1,3,2,3,0,3,0,0,1,3,1,3,3,3,0,0,3,3,3, + 0,3,0,3,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0, + 3,3,3,3,3,3,3,3,3,0,3,3,3,1,3,0,0,0,3,3,0,3,0, + 3,0,3,0,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,3,3,3,1,3,3,3,1,3,3,3,0,3,0,1,2,3,3,0,1,3, + 2,0,3,1,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,1,3,1,3,1,3,3,3,0,3,3,3,0,3,0,0,0,3,3,0,0,0, + 2,0,3,0,2,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2, + 3,3,3,3,3,3,0,3,0,3,1,0,2,3,3,3,3,3,0,0,3,3,3, + 0,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 2,3,3,2,3,3,3,3,3,2,3,3,3,0,3,0,0,0,3,3,0,2,0, + 3,0,3,0,3,2,3,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3, + 3,3,0,3,3,3,0,3,0,3,0,0,3,3,2,2,3,3,0,0,3,3,3, + 0,3,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,3,1,3,3,3,2,3,3,3,0,3,0,0,0,2,3,0,3,0, + 2,0,3,0,0,2,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,3,3,1,3,0,3,1,3,0,0,1,3,3,3,3,3,1,0,3,3,3, + 0,3,3,3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,0,3,0,3,0,3,0,3,0,1,0,2,3,3,3,3,2,2,3,2,3, + 0,2,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,2,3,0,3,0,0,0,3,0,0,0,3,1,3,2,3,0,0,3,3,3, + 0,3,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 3,3,1,3,1,3,0,0,0,3,0,0,1,3,0,3,3,2,0,0,3,3,3, + 0,2,0,3,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,2,3,3,3,2,3,3,3,3,3,2,3,3,3,3,0, + 0,1,1,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0, + 0,0,1,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,0, + 0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,1,3,3,3,3,0, + 0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,3,3,3,2,3,2,3,1,3,3,2,1,0, + 0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0, + 0,0,0,0,0,0,0,0,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,0, + 0,1,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,2,3,3,1,3,2,3,3,1,3,2,3,0, + 0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,0, + 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,2,2,3,1,3,2,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,1,3,3,0,3,3,2,1,3,3,2,2,0, + 0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,3,3,2,2,3,3,2,3,1,3,2,2,0, + 0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,2,3,2,3,3,2,2,1,2,3,1,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,3,3,2,3,1,3,1,1,2,3,2,1,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,2,3,2,3,3,3,3,3,1,3,1,3,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,2,3,2,1,1,1,2,0,1,3,3,3,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,3,3,3,1,3,3,3,3,3,1,3,2,3,2,2,0, + 3,3,0,2,0,3,0,0,0,3,0,0,0,3,0,3,2,3,0,0,0,2,2, + 0,2,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, }; @@ -226,8 +257,8 @@ const SequenceModel Iso_8859_7GreekModel = { Iso_8859_7_CharToOrderMap, GreekLangModel, - 43, - (float)0.9990009349577684, + 47, + (float)0.9990052068735468, PR_FALSE, "ISO-8859-7", "el" @@ -237,22 +268,33 @@ const SequenceModel Windows_1253GreekModel = { Windows_1253_CharToOrderMap, GreekLangModel, - 43, - (float)0.9990009349577684, + 47, + (float)0.9990052068735468, PR_FALSE, "WINDOWS-1253", "el" }; +const SequenceModel Cp737GreekModel = +{ + Cp737_CharToOrderMap, + GreekLangModel, + 47, + (float)0.9990052068735468, + PR_FALSE, + "CP737", + "el" +}; + const LanguageModel GreekModel = { "el", Unicode_CharOrder, - 86, + 94, GreekLangModel, - 43, + 47, 6, - (float)0.440545731568196, - 27, - (float)0.03465402569480207, + (float)0.4332502995419538, + 31, + (float)0.03350734517136193, }; diff --git a/src/nsSBCSGroupProber.cpp b/src/nsSBCSGroupProber.cpp index b0aa01a..9bf3ad3 100644 --- a/src/nsSBCSGroupProber.cpp +++ b/src/nsSBCSGroupProber.cpp @@ -69,6 +69,7 @@ nsSBCSGroupProber::nsSBCSGroupProber() mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_7GreekModel); mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1253GreekModel); + mProbers[n++] = new nsSingleByteCharSetProber(&Cp737GreekModel); mProbers[n++] = new nsSingleByteCharSetProber(&Iso_8859_5BulgarianModel); mProbers[n++] = new nsSingleByteCharSetProber(&Windows_1251BulgarianModel); diff --git a/src/nsSBCharSetProber-generated.h b/src/nsSBCharSetProber-generated.h index fa54561..86dbae5 100644 --- a/src/nsSBCharSetProber-generated.h +++ b/src/nsSBCharSetProber-generated.h @@ -38,7 +38,7 @@ #ifndef nsSingleByteCharSetProber_generated_h__ #define nsSingleByteCharSetProber_generated_h__ -#define NUM_OF_SEQUENCE_MODELS 115 +#define NUM_OF_SEQUENCE_MODELS 116 extern const SequenceModel Iso_8859_6ArabicModel; extern const SequenceModel Windows_1256ArabicModel; @@ -64,6 +64,7 @@ extern const SequenceModel Windows_1252GermanModel; extern const SequenceModel Iso_8859_7GreekModel; extern const SequenceModel Windows_1253GreekModel; +extern const SequenceModel Cp737GreekModel; extern const SequenceModel Iso_8859_1EnglishModel; extern const SequenceModel Windows_1252EnglishModel; diff --git a/test/el/cp737.txt b/test/el/cp737.txt new file mode 100644 index 0000000..4c8c14c --- /dev/null +++ b/test/el/cp737.txt @@ -0,0 +1 @@ + 櫘 夘 ⤦ 磜 ⩩ 回 ⤦ Marmota, 餫 樜 . ᭦ 回 夘 婫 . |