From ff7e19062e31b00107a0a732c000a650a559e6ca Mon Sep 17 00:00:00 2001
From: Michiel Scholten
Date: Thu, 7 Nov 2024 14:06:07 +0100
Subject: [PATCH] Include flexies, filter away (some?) English words

---
 wordlist/create_list.py | 85 +++++++++++++++++++++++++++--------------
 1 file changed, 56 insertions(+), 29 deletions(-)

diff --git a/wordlist/create_list.py b/wordlist/create_list.py
index 1e41ad3..0949a2d 100644
--- a/wordlist/create_list.py
+++ b/wordlist/create_list.py
@@ -6,40 +6,67 @@ MAX_LENGTH = 10
 NUMBER_DAYS = 5 * 365
 
 with open('basiswoorden-gekeurd.txt', 'r', encoding='utf-8') as wordfile:
-    all_words = wordfile.readlines()
-    print(f'original list contains {len(all_words)} words')
+    basis_words = wordfile.readlines()
+    print(f'original list contains {len(basis_words)} words')
 
-    dictionary_list = []
-    result_list = []
-    for word in all_words:
-        word = word.strip()
-        if word.isalpha() and word.lower() == word:
-            # Word is valid for our dictionary
-            dictionary_list.append(f'{word}\n')
-        if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
-            # Word is 'fit' for our game
-            result_list.append(f'{word}\n')
+with open('flexies-ongekeurd.txt', 'r', encoding='utf-8') as wordfile:
+    # Conjugations (inflected forms) and such, see https://nl.wikipedia.org/wiki/Flexie_(taalkunde)
+    flexies_words = wordfile.readlines()
+    print(f'flexies list contains {len(flexies_words)} words')
 
-    # print(result_list)
-    print(f'words filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
-    print(f'words in dictionary: {len(dictionary_list)}')
+with open('/usr/share/dict/american-english', 'r', encoding='utf-8') as wordfile:
+    # English words we want to filter from the list; don't really care if there's accidental overlap with Dutch words
+    english_words = wordfile.readlines()
+    print(f'english list contains {len(english_words)} words')
 
-    with open('filtered.txt', 'w', encoding='utf-8') as f:
-        f.writelines(result_list)
+print()
+print('merging and filtering...')
+print()
 
-    with open('dictionary.txt', 'w', encoding='utf-8') as f:
-        f.writelines(dictionary_list)
+all_words_count = 0
+dictionary_list = []
+result_list = []
+for word in basis_words + flexies_words:
+    all_words_count += 1
+    word = word.strip()
+    if word.isalpha() and word.lower() == word:
+        # Word is valid for our dictionary
+        dictionary_list.append(f'{word}\n')
 
-    selection_list = []
+for word in basis_words:
+    word = word.strip()
+    if word.isalpha() and word.lower() == word and len(word) >= MIN_LENGTH and len(word) <= MAX_LENGTH:
+        # Word is 'fit' for our game
+        result_list.append(f'{word}\n')
 
-    # Randomly select words for each day
-    while len(selection_list) < NUMBER_DAYS:
-        # Use index - 1 because lists start at index 0
-        word_index = random.randrange(0, len(result_list) - 1)
-        selection_list.append(result_list[word_index])
+nl_set = set(result_list)
+en_set = set(english_words)
 
-    # Save the result
-    with open('word_of_the_day.txt', 'w', encoding='utf-8') as f:
-        f.writelines(selection_list)
+# Only keep the words that are not found in the English set
+filtered_set = nl_set.difference(en_set)
+filtered_list = sorted(list(filtered_set), key=str.casefold)
 
-    print(f'done writing {len(selection_list)} random words, enjoy!')
+print(f'words total: {all_words_count}')
+print(f'words in dictionary: {len(dictionary_list)}')
+print(f'words initially filtered: {len(result_list)} with length >= {MIN_LENGTH} and <= {MAX_LENGTH}')
+print(f'words after filtering english: {len(filtered_list)}')
+
+with open('filtered.txt', 'w', encoding='utf-8') as f:
+    f.writelines(filtered_list)
+
+with open('dictionary.txt', 'w', encoding='utf-8') as f:
+    f.writelines(dictionary_list)
+
+selection_list = []
+
+# Randomly select words for each day
+while len(selection_list) < NUMBER_DAYS:
+    # randrange() excludes its upper bound; pick from the English-filtered list, not result_list
+    word_index = random.randrange(0, len(filtered_list))
+    selection_list.append(filtered_list[word_index])
+
+# Save the result
+with open('word_of_the_day.txt', 'w', encoding='utf-8') as f:
+    f.writelines(selection_list)
+
+print(f'done writing {len(selection_list)} random words, enjoy!')