Skip to content

Commit

Permalink
OPEN 2790, 2902: Remove temporary CSV loading fix and add support for…
Browse files Browse the repository at this point in the history
… NLTK feature
  • Loading branch information
thriuin committed Nov 7, 2023
1 parent 5086fc4 commit 5956936
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 7 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -161,5 +161,9 @@ cython_debug/
!/search/management/commands/contracts_*.py
!/search/management/commands/nap5_*.py
!/search/management/commands/qpnotes_*.py
/nltk/corpora
/nltk/tokenizers
/nltk/*.model
!/
# Don't include CSV files from the cache
!/cache/*.csv
1 change: 1 addition & 0 deletions nltk/NLTK.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Install the NLTK Stopwords and Punkt tokenizer here
1 change: 1 addition & 0 deletions oc_search/settings-sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,3 +300,4 @@
IMPORT_DATA_CSV_SOLR_INDEX_GROUP_SIZE = 10
IMPORT_DATA_CSV_BAD_DATA_DIR = os.path.join(BASE_DIR, 'bad_data')

NLTK_DATADIR = os.path.join(BASE_DIR, 'nltk')
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,11 @@ inflection==0.5.1
markdown2==2.4.10
nltk==3.8.1
pandas==1.4.4
psycopg2==2.9.7
psycopg2==2.9.9
python-dateutil==2.8.2
PyYAML==6.0.1
Unidecode==1.3.6
scikit-learn==1.3.2
Unidecode==1.3.7
uWSGI==2.0.19.1; sys_platform == 'linux'
#git+https://github.com/thriuin/SolrClient.git@master

Expand Down
5 changes: 0 additions & 5 deletions search/management/commands/import_data_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,11 +376,6 @@ def handle(self, *args, **options):
bd_writer.writeheader()
bd_file.flush()

# Temporary fix for connection blocking issue OPEN 2902
for solr_field in solr_record:
if type(solr_record[solr_field]) == str:
solr_record[solr_field] = str(solr_record[solr_field]).replace(" echo ", " ẹcho ")

solr_items.append(solr_record)
total += 1

Expand Down
18 changes: 18 additions & 0 deletions search/management/commands/nltk_download.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
from django.core.management.base import BaseCommand
from django.conf import settings
import logging
import nltk


class Command(BaseCommand):
    """Management command that fetches the NLTK data used by the project.

    Downloads the ``stopwords`` corpus and the ``punkt`` tokenizer into the
    directory named by ``settings.NLTK_DATADIR``.
    """

    help = 'Download the NLTK Stopwords and punkt tokenizer to a project folder'

    logger = logging.getLogger(__name__)

    def handle(self, *args, **options):
        """Download each required NLTK package, logging any failure.

        Failures are logged rather than raised, so the command itself
        always runs to completion.
        """
        try:
            for package in ('stopwords', 'punkt'):
                # nltk.download() signals most failures by returning False
                # instead of raising, so the result has to be checked.
                if not nltk.download(package, download_dir=settings.NLTK_DATADIR):
                    self.logger.error(
                        "NLTK download of '%s' to %s failed",
                        package, settings.NLTK_DATADIR)
        except Exception:
            # Best-effort command: record the full traceback and carry on.
            self.logger.exception("Unexpected error while downloading NLTK data")

0 comments on commit 5956936

Please sign in to comment.