Include flexies, filter away (some?) English words

2026-06-21 02:07:36 +02:00 · 2024-11-07 14:06:07 +01:00
parent a81788d172
commit ff7e19062e
1 changed files with 56 additions and 29 deletions
@@ -6,40 +6,67 @@ MAX_LENGTH = 10
 NUMBER_DAYS = 5 * 365
 with open('basiswoorden-gekeurd.txt', 'r', encoding='utf-8') as wordfile:
-    all_words = wordfile.readlines()
+    basis_words = wordfile.readlines()
-    print(f'original list contains {len(all_words)} words')
+    print(f'original list contains {len(basis_words)} words')
-    dictionary_list = []
+with open('flexies-ongekeurd.txt', 'r', encoding='utf-8') as wordfile:
-    result_list = []
+    # Vervoegingen and such, see https://nl.wikipedia.org/wiki/Flexie_(taalkunde)
-    for word in all_words:
+    flexies_words = wordfile.readlines()
-        word = word.strip()
+    print(f'flexies list contains {len(flexies_words)} words')
        if word.isalpha() and word.lower() == word:
            # Word is valid for our dictionary
            dictionary_list.append(f'{word}\n')
        if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
            # Word is 'fit' for our game
            result_list.append(f'{word}\n')
-    # print(result_list)
+with open('/usr/share/dict/american-english', 'r', encoding='utf-8') as wordfile:
-    print(f'words filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
+    # English words we want to filter from the list; don't really care if there's accidental overlap with Dutch words
-    print(f'words in dictionary: {len(dictionary_list)}')
+    english_words = wordfile.readlines()
    print(f'english list contains {len(english_words)} words')
-    with open('filtered.txt', 'w', encoding='utf-8') as f:
+print()
-        f.writelines(result_list)
+print('merging and filtering...')
 print()
-    with open('dictionary.txt', 'w', encoding='utf-8') as f:
+all_words_count = 0
-        f.writelines(dictionary_list)
+dictionary_list = []
 result_list = []
 for word in basis_words + flexies_words:
    all_words_count += 1
    word = word.strip()
    if word.isalpha() and word.lower() == word:
        # Word is valid for our dictionary
        dictionary_list.append(f'{word}\n')
-    selection_list = []
+for word in basis_words:
    word = word.strip()
    if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
        # Word is 'fit' for our game
        result_list.append(f'{word}\n')
-    # Randomly select words for each day
+nl_set = set(result_list)
-    while len(selection_list) < NUMBER_DAYS:
+en_set = set(english_words)
        # Use index - 1 because lists start at index 0
        word_index = random.randrange(0, len(result_list) - 1)
        selection_list.append(result_list[word_index])
-    # Save the result
+# Only keep the words that are not found in the English set
-    with open('word_of_the_day.txt', 'w', encoding='utf-8') as f:
+filtered_set = nl_set.difference(en_set)
-        f.writelines(selection_list)
+filtered_list = sorted(list(filtered_set), key=str.casefold)
-    print(f'done writing {len(selection_list)} random words, enjoy!')
+print(f'words total: {all_words_count}')
 print(f'words in dictionary: {len(dictionary_list)}')
 print(f'words initially filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
 print(f'words after filtering english: {len(filtered_list)}')
 with open('filtered.txt', 'w', encoding='utf-8') as f:
    f.writelines(filtered_list)
 with open('dictionary.txt', 'w', encoding='utf-8') as f:
    f.writelines(dictionary_list)
 selection_list = []
 # Randomly select one word per day from the English-filtered list. Draws are
 # independent, so the same word can be selected for more than one day.
 while len(selection_list) < NUMBER_DAYS:
    # random.choice can return any element, including the last one. Draw from
    # filtered_list (not result_list) so that words removed by the English
    # filter can never end up as a word of the day.
    selection_list.append(random.choice(filtered_list))
 # Save the result
 with open('word_of_the_day.txt', 'w', encoding='utf-8') as f:
    f.writelines(selection_list)
 print(f'done writing {len(selection_list)} random words, enjoy!')