Added sorting that handles ÆØÅ better.

Some observed issues with AA vs Å.
Ondkloss · Apr 23, 2019 · 5a292c7 · 5a292c7
1 parent 81a7a5c
commit 5a292c7
Show file tree

Hide file tree

Showing 5 changed files with 8,044 additions and 8,028 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1 +1,2 @@
-*/
+*/
+PyICU*
diff --git a/README.md b/README.md
@@ -6,6 +6,8 @@ Simple project to create a list of Norwegian words. To run:
 
 Example output (and working wordlist) is `wordlist_20190123_norsk_ordbank_nob_2005.txt` and `wordlist_20190123_norsk_ordbank_nno_2012.txt`.
 
+To sort Æ, Ø and Å correctly, make the PyICU module available. It also improves the ordering of letters with diacritics (E, É, È, Ê), but may give surprising results because AA is sorted as Å.
+
 ## Source
 
 The bokmål source material is from [Norsk Ordbank in Norwegian Bokmål 2005](https://www.nb.no/sprakbanken/show?serial=oai%3Anb.no%3Asbr-5&lang=en), the 2019-02-20 update. It is released under the [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/).
@@ -20,4 +22,4 @@ The software in this repo is licensed under WTFPL and can be read in `LICENSE`.
 
 * The regex that removes proper nouns also removes several valid words.
 * One might evaluate also removing some additional special characters, for example `1`, `2`, `3`, `4` and `/`.
-* Å is sorted before Ø.
+* Must update source when replacing tar.gz because of date.
diff --git a/word_parser.py b/word_parser.py
@@ -68,13 +68,27 @@ def remove_word_starts_and_endings(lines):
 
 def remove_words_with_special_characters(lines):
     # might evaluate removing 1234/ as well
-    return filter_out_pattern(lines, r'^.*[\'\$%&\.\(\) ].*$')
+    return filter_out_pattern(lines, r'^.*[\'\$%&°\.\(\) ].*$')
 
 
 def remove_single_letter_words(lines):
     return filter_out_pattern(lines, r'^.{1}$')
 
 
+def sort_locale(lines):
+    try:
+        import PyICU
+    except ImportError:
+        PyICU = None
+
+    if PyICU:
+        collator = PyICU.Collator.createInstance(PyICU.Locale('nb_NO'))
+        return sorted(lines, key=collator.getSortKey)
+    else:
+        print("To get locale specific sorting (æøå) the PyICO module is required. Doing basic sort.")
+        return sorted(lines)
+
+
 def parse_into_wordlist(filename):
     # prepare content
     extract_tar('{}.tar.gz'.format(filename))
@@ -91,7 +105,7 @@ def parse_into_wordlist(filename):
     lines = remove_words_with_special_characters(lines)
     lines = remove_words_with_special_characters(lines)
     lines = remove_single_letter_words(lines)
-    lines = sorted(lines)
+    lines = sort_locale(lines)
 
     # persist result
     set_file_contents('wordlist_{}.txt'.format(filename), lines)
-Original file line number
+Diff line change
@@ -1 +1,2 @@
-    */
+    */
+    PyICU*