Use Wiktionary WikiWoordenboek list as source

2026-06-21 09:07:35 +02:00 · 2024-11-07 15:33:17 +01:00
parent 5ca91ef955
commit 4afa475d63
1 changed files with 30 additions and 10 deletions
@@ -5,9 +5,16 @@ MAX_LENGTH = 10
 NUMBER_DAYS = 5 * 365
 # Set to True if you want to use the big but difficult OpenTaal list
 USE_OPENTAAL = False
 with open('wikiwoordenboek_basiswoorden.lst', 'r', encoding='utf-8') as wordfile:
    wikiwoorden_words = wordfile.readlines()
    print(f'wikiwoorden basic list contains {len(wikiwoorden_words)} words')
 with open('basiswoorden-gekeurd.txt', 'r', encoding='utf-8') as wordfile:
    basis_words = wordfile.readlines()
-    print(f'original list contains {len(basis_words)} words')
+    print(f'opentaal basic list contains {len(basis_words)} words')
 with open('flexies-ongekeurd.txt', 'r', encoding='utf-8') as wordfile:
    # Vervoegingen and such, see https://nl.wikipedia.org/wiki/Flexie_(taalkunde)
@@ -26,30 +33,43 @@ print()
 all_words_count = 0
 dictionary_list = []
 result_list = []
-for word in basis_words + flexies_words:
+for word in wikiwoorden_words + basis_words + flexies_words:
    all_words_count += 1
    word = word.strip()
    if word.isalpha() and word.lower() == word:
        # Word is valid for our dictionary
        dictionary_list.append(f'{word}\n')
-for word in basis_words:
+# Deduplicate dictionary
 dictionary_list = sorted(list(set(dictionary_list)), key=str.casefold)
 if USE_OPENTAAL:
    # Use basis_words if you want to use the big but difficult OpenTaal list
    source_words = basis_words
 else:
    source_words = wikiwoorden_words
 for word in source_words:
    word = word.strip()
    if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
        # Word is 'fit' for our game
        result_list.append(f'{word}\n')
-nl_set = set(result_list)
+if USE_OPENTAAL:
-en_set = set(english_words)
+    nl_set = set(result_list)
    en_set = set(english_words)
-# Only keep the words that are not found in the English set
+    # Only keep the words that are not found in the English set
-filtered_set = nl_set.difference(en_set)
+    filtered_set = nl_set.difference(en_set)
-filtered_list = sorted(list(filtered_set), key=str.casefold)
+    filtered_list = sorted(list(filtered_set), key=str.casefold)
 else:
    filtered_list = sorted(list(wikiwoorden_words), key=str.casefold)
 print(f'words total: {all_words_count}')
 print(f'words in dictionary: {len(dictionary_list)}')
 print(f'words initially filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
-print(f'words after filtering english: {len(filtered_list)}')
+if USE_OPENTAAL:
    print(f'words after filtering english: {len(filtered_list)}')
 with open('filtered.txt', 'w', encoding='utf-8') as f:
    f.writelines(filtered_list)
@@ -63,7 +83,7 @@ selection_list = []
 while len(selection_list) < NUMBER_DAYS:
    # Use index - 1 because lists start at index 0
    word_index = random.randrange(0, len(filtered_list) - 1)
-    selection_list.append(result_list[word_index])
+    selection_list.append(filtered_list[word_index])
 # Save the result
 with open('word_of_the_day.txt', 'w', encoding='utf-8') as f: