Mirror of https://github.com/aquatix/alfagok.git (synced 2025-12-06 21:05:10 +01:00)
Use Wiktionary WikiWoordenboek list as source
This commit is contained in:
@@ -5,9 +5,16 @@ MAX_LENGTH = 10
|
|||||||
|
|
||||||
NUMBER_DAYS = 5 * 365
|
NUMBER_DAYS = 5 * 365
|
||||||
|
|
||||||
|
# Set to True if you want to use the big but difficult OpenTaal list
|
||||||
|
USE_OPENTAAL = False
|
||||||
|
|
||||||
|
with open('wikiwoordenboek_basiswoorden.lst', 'r', encoding='utf-8') as wordfile:
|
||||||
|
wikiwoorden_words = wordfile.readlines()
|
||||||
|
print(f'wikiwoorden basic list contains {len(wikiwoorden_words)} words')
|
||||||
|
|
||||||
with open('basiswoorden-gekeurd.txt', 'r', encoding='utf-8') as wordfile:
|
with open('basiswoorden-gekeurd.txt', 'r', encoding='utf-8') as wordfile:
|
||||||
basis_words = wordfile.readlines()
|
basis_words = wordfile.readlines()
|
||||||
print(f'original list contains {len(basis_words)} words')
|
print(f'opentaal basic list contains {len(basis_words)} words')
|
||||||
|
|
||||||
with open('flexies-ongekeurd.txt', 'r', encoding='utf-8') as wordfile:
|
with open('flexies-ongekeurd.txt', 'r', encoding='utf-8') as wordfile:
|
||||||
# Vervoegingen and such, see https://nl.wikipedia.org/wiki/Flexie_(taalkunde)
|
# Vervoegingen and such, see https://nl.wikipedia.org/wiki/Flexie_(taalkunde)
|
||||||
@@ -26,30 +33,43 @@ print()
|
|||||||
all_words_count = 0
|
all_words_count = 0
|
||||||
dictionary_list = []
|
dictionary_list = []
|
||||||
result_list = []
|
result_list = []
|
||||||
for word in basis_words + flexies_words:
|
for word in wikiwoorden_words + basis_words + flexies_words:
|
||||||
all_words_count += 1
|
all_words_count += 1
|
||||||
word = word.strip()
|
word = word.strip()
|
||||||
if word.isalpha() and word.lower() == word:
|
if word.isalpha() and word.lower() == word:
|
||||||
# Word is valid for our dictionary
|
# Word is valid for our dictionary
|
||||||
dictionary_list.append(f'{word}\n')
|
dictionary_list.append(f'{word}\n')
|
||||||
|
|
||||||
for word in basis_words:
|
# Deduplicate dictionary
|
||||||
|
dictionary_list = sorted(list(set(dictionary_list)), key=str.casefold)
|
||||||
|
|
||||||
|
if USE_OPENTAAL:
|
||||||
|
# Use basis_words if you want to use the big but difficult OpenTaal list
|
||||||
|
source_words = basis_words
|
||||||
|
else:
|
||||||
|
source_words = wikiwoorden_words
|
||||||
|
|
||||||
|
for word in source_words:
|
||||||
word = word.strip()
|
word = word.strip()
|
||||||
if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
|
if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
|
||||||
# Word is 'fit' for our game
|
# Word is 'fit' for our game
|
||||||
result_list.append(f'{word}\n')
|
result_list.append(f'{word}\n')
|
||||||
|
|
||||||
nl_set = set(result_list)
|
if USE_OPENTAAL:
|
||||||
en_set = set(english_words)
|
nl_set = set(result_list)
|
||||||
|
en_set = set(english_words)
|
||||||
|
|
||||||
# Only keep the words that are not found in the English set
|
# Only keep the words that are not found in the English set
|
||||||
filtered_set = nl_set.difference(en_set)
|
filtered_set = nl_set.difference(en_set)
|
||||||
filtered_list = sorted(list(filtered_set), key=str.casefold)
|
filtered_list = sorted(list(filtered_set), key=str.casefold)
|
||||||
|
else:
|
||||||
|
filtered_list = sorted(list(wikiwoorden_words), key=str.casefold)
|
||||||
|
|
||||||
print(f'words total: {all_words_count}')
|
print(f'words total: {all_words_count}')
|
||||||
print(f'words in dictionary: {len(dictionary_list)}')
|
print(f'words in dictionary: {len(dictionary_list)}')
|
||||||
print(f'words initially filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
|
print(f'words initially filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
|
||||||
print(f'words after filtering english: {len(filtered_list)}')
|
if USE_OPENTAAL:
|
||||||
|
print(f'words after filtering english: {len(filtered_list)}')
|
||||||
|
|
||||||
with open('filtered.txt', 'w', encoding='utf-8') as f:
|
with open('filtered.txt', 'w', encoding='utf-8') as f:
|
||||||
f.writelines(filtered_list)
|
f.writelines(filtered_list)
|
||||||
@@ -63,7 +83,7 @@ selection_list = []
|
|||||||
while len(selection_list) < NUMBER_DAYS:
|
while len(selection_list) < NUMBER_DAYS:
|
||||||
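# Use index - 1 because lists start at index 0 (NOTE(review): randrange() already excludes its stop value, so the `- 1` means the last word in the list is never selected)
|
# Use index - 1 because lists start at index 0 (NOTE(review): randrange() already excludes its stop value, so the `- 1` means the last word in the list is never selected)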
|
||||||
word_index = random.randrange(0, len(filtered_list) - 1)
|
word_index = random.randrange(0, len(filtered_list) - 1)
|
||||||
selection_list.append(result_list[word_index])
|
selection_list.append(filtered_list[word_index])
|
||||||
|
|
||||||
# Save the result
|
# Save the result
|
||||||
with open('word_of_the_day.txt', 'w', encoding='utf-8') as f:
|
with open('word_of_the_day.txt', 'w', encoding='utf-8') as f:
|
||||||
|
|||||||
Reference in New Issue
Block a user