summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJehan <jehan@girinstud.io>2021-03-17 17:15:56 +0100
committerJehan <jehan@girinstud.io>2022-12-14 00:23:13 +0100
commit2a16ab2310eef71a0595e6616b3d24a3a6267bb0 (patch)
tree542183e919f04a5d80d9ae204eb00a336a2c86b5
parent6138d9e0f0f83a63b45f1f361cb59a0c5f1d561c (diff)
src: nsEscCharsetProber also returns the correct language.
nsEscCharsetProber will still only return a single candidate, because the encoding is detected by a state machine rather than by language statistics. It will now also return the language attached to the detected encoding.
-rw-r--r--src/nsCodingStateMachine.h2
-rw-r--r--src/nsEscCharsetProber.cpp17
-rw-r--r--src/nsEscCharsetProber.h3
-rw-r--r--src/nsEscSM.cpp1
-rw-r--r--src/nsUniversalDetector.cpp3
-rw-r--r--src/nsUniversalDetector.h1
6 files changed, 21 insertions, 6 deletions
diff --git a/src/nsCodingStateMachine.h b/src/nsCodingStateMachine.h
index 8861118..dc5b7c2 100644
--- a/src/nsCodingStateMachine.h
+++ b/src/nsCodingStateMachine.h
@@ -77,7 +77,7 @@ public:
}
PRUint32 GetCurrentCharLen(void) {return mCurrentCharLen;}
void Reset(void) {mCurrentState = eStart;}
- const char * GetCodingStateMachine() {return mModel->name;}
+ const SMModel* GetCodingStateMachine() {return mModel;}
protected:
PRUint32 mCurrentState;
diff --git a/src/nsEscCharsetProber.cpp b/src/nsEscCharsetProber.cpp
index 4c31105..ff76552 100644
--- a/src/nsEscCharsetProber.cpp
+++ b/src/nsEscCharsetProber.cpp
@@ -55,6 +55,7 @@ nsEscCharSetProber::nsEscCharSetProber(PRUint32 aLanguageFilter)
mActiveSM = NUM_OF_ESC_CHARSETS;
mState = eDetecting;
mDetectedCharset = nsnull;
+ mDetectedLang = nsnull;
}
nsEscCharSetProber::~nsEscCharSetProber(void)
@@ -71,8 +72,10 @@ void nsEscCharSetProber::Reset(void)
mCodingSM[i]->Reset();
mActiveSM = NUM_OF_ESC_CHARSETS;
mDetectedCharset = nsnull;
+ mDetectedLang = nsnull;
}
+#include <cstdio>
nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
int** codePointBuffer,
int* codePointBufferIdx)
@@ -90,8 +93,19 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
codingState = mCodingSM[j]->NextState(aBuf[i]);
if (codingState == eItsMe)
{
+ const SMModel *model = mCodingSM[j]->GetCodingStateMachine();
+
mState = eFoundIt;
- mDetectedCharset = mCodingSM[j]->GetCodingStateMachine();
+ mDetectedCharset = model->name;
+
+ if (model == &HZSMModel ||
+ model == &ISO2022CNSMModel)
+ mDetectedLang = "zh";
+ else if (model == &ISO2022JPSMModel)
+ mDetectedLang = "ja";
+ else if (model == &ISO2022KRSMModel)
+ mDetectedLang = "ko";
+
return mState;
}
}
@@ -100,4 +114,3 @@ nsProbingState nsEscCharSetProber::HandleData(const char* aBuf, PRUint32 aLen,
return mState;
}
-
diff --git a/src/nsEscCharsetProber.h b/src/nsEscCharsetProber.h
index e3167da..9f634a8 100644
--- a/src/nsEscCharsetProber.h
+++ b/src/nsEscCharsetProber.h
@@ -54,7 +54,7 @@ public:
int* codePointBufferIdx);
virtual int GetCandidates() { return 1; }
const char* GetCharSetName(int) {return mDetectedCharset;}
- const char* GetLanguage(int) {return NULL;}
+ const char* GetLanguage(int) {return mDetectedLang;}
nsProbingState GetState(void) {return mState;}
void Reset(void);
float GetConfidence(int){return (float)0.99;}
@@ -67,6 +67,7 @@ protected:
PRUint32 mActiveSM;
nsProbingState mState;
const char * mDetectedCharset;
+ const char * mDetectedLang;
};
#endif /* nsEscCharSetProber_h__ */
diff --git a/src/nsEscSM.cpp b/src/nsEscSM.cpp
index dcc252c..cf73c2f 100644
--- a/src/nsEscSM.cpp
+++ b/src/nsEscSM.cpp
@@ -264,4 +264,3 @@ const SMModel ISO2022KRSMModel = {
ISO2022KRCharLenTable,
"ISO-2022-KR",
};
-
diff --git a/src/nsUniversalDetector.cpp b/src/nsUniversalDetector.cpp
index 6695aff..b143d9f 100644
--- a/src/nsUniversalDetector.cpp
+++ b/src/nsUniversalDetector.cpp
@@ -248,6 +248,7 @@ nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
{
shortcutCharset = mEscCharSetProber->GetCharSetName(0);
shortcutConfidence = mEscCharSetProber->GetConfidence(0);
+ shortcutLanguage = mEscCharSetProber->GetLanguage(0);
mDone = PR_TRUE;
}
break;
@@ -313,7 +314,7 @@ void nsUniversalDetector::DataEnd()
* when finding them.
*/
mDone = PR_TRUE;
- Report(shortcutCharset, NULL, shortcutConfidence);
+ Report(shortcutCharset, shortcutLanguage, shortcutConfidence);
return;
}
diff --git a/src/nsUniversalDetector.h b/src/nsUniversalDetector.h
index a286ed9..e7df39b 100644
--- a/src/nsUniversalDetector.h
+++ b/src/nsUniversalDetector.h
@@ -81,6 +81,7 @@ protected:
PRBool mGotData;
char mLastChar;
const char * shortcutCharset;
+ const char * shortcutLanguage;
float shortcutConfidence;
PRInt32 mBestGuess;