summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Jehan <jehan@girinstud.io> 2021-11-09 22:06:47 +0100
committer Jehan <jehan@girinstud.io> 2022-12-14 00:24:53 +0100
commit 81b83fffa9b0fa044878fdd154f4e6adf9aa4e68 (patch)
tree 0939cae26ed42fd35d1e989f8c4e5e860d5022ca
parent a3ff09bece9ee0e787d1964bda06c9ea341a8982 (diff)
script: work around a recent issue in the Python wikipedia module.
Add `auto_suggest=False` to the wikipedia.page() call because the module's auto-suggest feature is completely broken: it searches for "mar ot" instead of "marmot", or "ground hug" instead of "Groundhog" (this one is extra funny, but not so useful!). I actually wonder why it even needs to suggest anything when the Wikipedia pages do actually exist! Anyway, the script BuildLangModel.py was badly broken because of this; it now works better.

See: https://github.com/goldsmith/Wikipedia/issues/295

Also print the error message when we discard a page, which helps debugging.
-rwxr-xr-x script/BuildLangModel.py 6
1 files changed, 3 insertions, 3 deletions
diff --git a/script/BuildLangModel.py b/script/BuildLangModel.py
index d4f315c..faf28bd 100755
--- a/script/BuildLangModel.py
+++ b/script/BuildLangModel.py
@@ -322,11 +322,11 @@ def visit_pages(titles, depth, lang, logfd):
visited_pages += [title]
try:
- page = wikipedia.page(title)
+ page = wikipedia.page(title, auto_suggest=False)
except (wikipedia.exceptions.PageError,
- wikipedia.exceptions.DisambiguationError):
+ wikipedia.exceptions.DisambiguationError) as error:
# Let's just discard a page when I get an exception.
- print("Discarding page {}.\n".format(title))
+ print("Discarding page {}: {}\n".format(title, error))
continue
logfd.write("\n{} (revision {})".format(title, page.revision_id))
logfd.flush()