diff options
author | Jehan <jehan@girinstud.io> | 2021-03-17 17:15:56 +0100 |
---|---|---|
committer | Jehan <jehan@girinstud.io> | 2022-12-14 00:23:13 +0100 |
commit | 2a16ab2310eef71a0595e6616b3d24a3a6267bb0 (patch) | |
tree | 542183e919f04a5d80d9ae204eb00a336a2c86b5 | |
parent | 6138d9e0f0f83a63b45f1f361cb59a0c5f1d561c (diff) |
src: nsEscCharsetProber also returns the correct language.
nsEscCharsetProber will still return only a single candidate, because
the encoding is detected by a state machine rather than by language
statistics. It now also returns the language attached to that encoding.
-rw-r--r-- | src/nsCodingStateMachine.h | 2 | ||||
-rw-r--r-- | src/nsEscCharsetProber.cpp | 17 | ||||
-rw-r--r-- | src/nsEscCharsetProber.h | 3 | ||||
-rw-r--r-- | src/nsEscSM.cpp | 1 | ||||
-rw-r--r-- | src/nsUniversalDetector.cpp | 3 | ||||
-rw-r--r-- | src/nsUniversalDetector.h | 1 |
6 files changed, 21 insertions, 6 deletions
diff --git a/src/nsCodingStateMachine.h b/src/nsCodingStateMachine.h index 8861118..dc5b7c2 100644 --- a/src/nsCodingStateMachine.h +++ b/src/nsCodingStateMachine.h @@ -77,7 +77,7 @@ public: } PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;} void Reset(void) {mCurrentState = eStart;} - const char * GetCodingStateMachine() {return mModel->name;} + const SMModel* GetCodingStateMachine() {return mModel;} protected: PRUint32 mCurrentState; diff --git a/src/nsEscCharsetProber.cpp b/src/nsEscCharsetProber.cpp index 4c31105..ff76552 100644 --- a/src/nsEscCharsetProber.cpp +++ b/src/nsEscCharsetProber.cpp @@ -55,6 +55,7 @@ nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter) mActiveSM = NUM_OF_ESC_CHARSETS; mState = eDetecting; mDetectedCharset = nsnull; + mDetectedLang = nsnull; } nsEscCharSetProber::~nsEscCharSetProber(void) @@ -71,8 +72,10 @@ void nsEscCharSetProber::Reset(void) mCodingSM[i]->Reset(); mActiveSM = NUM_OF_ESC_CHARSETS; mDetectedCharset = nsnull; + mDetectedLang = nsnull; } +#include <cstdio> nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen, int** codePointBuffer, int* codePointBufferIdx) @@ -90,8 +93,19 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen, codingState = mCodingSM[j]->NextState(aBuf[i]); if (codingState == eItsMe) { + const SMModel *model = mCodingSM[j]->GetCodingStateMachine(); + mState = eFoundIt; - mDetectedCharset = mCodingSM[j]->GetCodingStateMachine(); + mDetectedCharset = model->name; + + if (model == &HZSMModel || + model == &ISO2022CNSMModel) + mDetectedLang = "zh"; + else if (model == &ISO2022JPSMModel) + mDetectedLang = "ja"; + else if (model == &ISO2022KRSMModel) + mDetectedLang = "ko"; + return mState; } } @@ -100,4 +114,3 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen, return mState; } - diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h index e3167da..9f634a8 100644 --- a/src/nsEscCharsetProber.h +++ 
b/src/nsEscCharsetProber.h @@ -54,7 +54,7 @@ public: int* codePointBufferIdx); virtual int GetCandidates() { return 1; } const char* GetCharSetName(int) {return mDetectedCharset;} - const char* GetLanguage(int) {return NULL;} + const char* GetLanguage(int) {return mDetectedLang;} nsProbingState GetState(void) {return mState;} void Reset(void); float GetConfidence(int){return (float)0.99;} @@ -67,6 +67,7 @@ protected: PRUint32 mActiveSM; nsProbingState mState; const char * mDetectedCharset; + const char * mDetectedLang; }; #endif /* nsEscCharSetProber_h__ */ diff --git a/src/nsEscSM.cpp b/src/nsEscSM.cpp index dcc252c..cf73c2f 100644 --- a/src/nsEscSM.cpp +++ b/src/nsEscSM.cpp @@ -264,4 +264,3 @@ const SMModel ISO2022KRSMModel = { ISO2022KRCharLenTable, "ISO-2022-KR", }; - diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp index 6695aff..b143d9f 100644 --- a/src/nsUniversalDetector.cpp +++ b/src/nsUniversalDetector.cpp @@ -248,6 +248,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen) { shortcutCharset = mEscCharSetProber->GetCharSetName(0); shortcutConfidence = mEscCharSetProber->GetConfidence(0); + shortcutLanguage = mEscCharSetProber->GetLanguage(0); mDone = PR_TRUE; } break; @@ -313,7 +314,7 @@ void nsUniversalDetector::DataEnd() * when finding them. */ mDone = PR_TRUE; - Report(shortcutCharset, NULL, shortcutConfidence); + Report(shortcutCharset, shortcutLanguage, shortcutConfidence); return; } diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h index a286ed9..e7df39b 100644 --- a/src/nsUniversalDetector.h +++ b/src/nsUniversalDetector.h @@ -81,6 +81,7 @@ protected: PRBool mGotData; char mLastChar; const char * shortcutCharset; + const char * shortcutLanguage; float shortcutConfidence; PRInt32 mBestGuess; |