Skip to content

Commit

Permalink
Merge pull request #66 from PeARSearch/minimalparts/restructuring
Browse files Browse the repository at this point in the history
Restructured app
  • Loading branch information
minimalparts authored Nov 2, 2024
2 parents 0149347 + 8b6fcaa commit dd4527a
Show file tree
Hide file tree
Showing 12 changed files with 289 additions and 333 deletions.
73 changes: 59 additions & 14 deletions app/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ def setup_logger(name, log_file, level=logging.INFO):
# Make sure user data directories exist
DEFAULT_PATH = dir_path
Path(path.join(DEFAULT_PATH,'userdata')).mkdir(parents=True, exist_ok=True)
Path(path.join(DEFAULT_PATH,'admindata')).mkdir(parents=True, exist_ok=True)
if getenv("SUGGESTIONS_DIR", "") != "":
Path(getenv("SUGGESTIONS_DIR")).mkdir(parents=True, exist_ok=True)

Expand Down Expand Up @@ -259,7 +258,8 @@ def serve_logos(path):
from flask_admin.contrib.sqla import ModelView
from app.api.models import Pods, Urls, User, Personalization
from app.api.controllers import return_pod_delete
from app.utils_db import delete_url_representations
from app.utils_db import delete_url_representations, delete_pod_representations, \
rm_from_npz, add_to_npz, create_pod_in_db, create_pod_npz_pos, rm_doc_from_pos, update_db_idvs_after_npz_delete

from flask_admin import expose
from flask_admin.contrib.sqla.view import ModelView
Expand Down Expand Up @@ -288,12 +288,11 @@ def is_accessible(self):

admin = Admin(app, name='PeARS DB', template_mode='bootstrap3', index_view=MyAdminIndexView())


class UrlsModelView(ModelView):
list_template = 'admin/pears_list.html'
column_exclude_list = ['vector','snippet','date_created','date_modified']
column_searchable_list = ['url', 'title', 'doctype', 'notes', 'pod']
column_editable_list = ['notes']
column_hide_backrefs = False
column_list = ['url', 'title', 'pod', 'notes']
column_searchable_list = ['url', 'title', 'pod', 'notes']
can_edit = True
page_size = 100
form_widget_args = {
Expand All @@ -303,12 +302,6 @@ class UrlsModelView(ModelView):
'url': {
'readonly': True
},
'pod': {
'readonly': True
},
'snippet': {
'readonly': True
},
'date_created': {
'readonly': True
},
Expand All @@ -326,15 +319,67 @@ def delete_model(self, model):
except Exception as ex:
if not self.handle_view_exception(ex):
flash(gettext('Failed to delete record. %(error)s', error=str(ex)), 'error')

self.session.rollback()

return False
else:
self.after_model_delete(model)

return True

def update_model(self, form, model):
    """
    Update a Urls record from the admin edit form.

    When the edit changes the record's pod, the document's vector is also
    taken out of the old pod (rm_from_npz) and appended to the new pod's
    .npz file (add_to_npz); the old pod is deleted once no URL refers to it.

    :param form: the submitted form; form.pod.data holds the requested pod
        name, with or without the '.u.<contributor>' suffix.
    :param model: the record being edited (still unmodified on entry).
    :return: True on success, False if the database update or the vector
        move failed (an error message is flashed in that case).
    """
    try:
        # At this point model still has the unmodified values.
        old_pod = model.pod
        _, contributor = old_pod.split('.u.')
        # The form may contain the theme only: re-attach the contributor suffix.
        if '.u.' not in form.pod.data:
            form.pod.data += '.u.' + contributor
        new_pod = form.pod.data
        new_theme = new_pod.split('.u.')[0]
        pod_record = db.session.query(Pods).filter_by(name=old_pod).first()
        if pod_record is None:
            # Fail with a readable message instead of an AttributeError on None.
            raise ValueError(f"Pod {old_pod} does not exist in the database.")
        lang = pod_record.language
        form.populate_obj(model)

        # At this point model has the form values; run the change hook.
        self._on_model_change(form, model, False)

        # Model is now being committed.
        self.session.commit()
    except Exception as ex:
        if not self.handle_view_exception(ex):
            flash(gettext('Failed to update record. %(error)s', error=str(ex)), 'error')
        self.session.rollback()
        return False

    # Model is now committed to the database.
    if old_pod != new_pod:
        print(f"Pod name has changed from {old_pod} to {new_pod}!")
        print("Move vector in npz file")
        try:
            pod_path = create_pod_npz_pos(contributor, new_theme, lang)
            create_pod_in_db(contributor, new_theme, lang)
            idv, v = rm_from_npz(model.vector, old_pod)
            update_db_idvs_after_npz_delete(idv, old_pod)
            add_to_npz(v, pod_path + '.npz')
            # Removing from pos but not re-adding since current version does
            # not make use of positional index. To fix.
            rm_doc_from_pos(model.id, old_pod)
            self.session.commit()
            # If the old pod is now empty, delete it. Only existence matters,
            # so fetch at most one row rather than the whole pod.
            if db.session.query(Urls).filter_by(pod=old_pod).first() is None:
                delete_pod_representations(old_pod)
        except Exception as ex:
            if not self.handle_view_exception(ex):
                flash(gettext('Failed to update record. %(error)s', error=str(ex)), 'error')
            self.session.rollback()
            return False

    self.after_model_change(form, model, False)
    return True


class PodsModelView(ModelView):
list_template = 'admin/pears_list.html'
column_exclude_list = ['DS_vector','word_vector']
Expand Down
68 changes: 1 addition & 67 deletions app/api/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from app.indexer.vectorizer import scale
from app.search.controllers import get_search_results, prepare_gui_results
from app.search.score_pages import mk_podsum_matrix
from app.utils_db import load_idx_to_url, load_npz_to_idx, rm_from_idx_to_url, delete_pod_representations
from app.utils_db import delete_pod_representations

# Define the blueprint:
api = Blueprint('api', __name__, url_prefix='/api')
Expand Down Expand Up @@ -91,69 +91,3 @@ def return_specific_url():
internal_message=internal_message, searchform=SearchForm())


# NOTE(review): the route rule '/urls/delete' declares no URL variable, yet the
# view function requires a `path` argument — confirm how `path` is supplied.
@api.route('/urls/delete', methods=["GET","POST"])
@login_required
@check_is_confirmed
def return_url_delete(path):
    """Delete the document stored under URL `path` from its pod.

    Looks up the Urls record for `path`, removes the document from the
    contributor's .idx file, from the pod's .npz matrix, from the .npz.idx
    mapping and from the positional index, then deletes the database record.

    :param path: URL of the document to delete.
    :return: a confirmation string containing the removed vector id.
    """
    # NOTE(review): `u` and `pod` are used without a None check; an unknown
    # URL or pod name would fail on the attribute access below.
    u = db.session.query(Urls).filter_by(url=path).first()
    pod_name = u.pod
    # Pod names have the form '<theme>.u.<contributor>'.
    theme, contributor = pod_name.split('.u.')
    pod = db.session.query(Pods).filter_by(name=pod_name).first()
    lang = pod.language
    vocab = models[lang]['vocab']

    #Remove document from main .idx file
    # NOTE(review): delete_url_from_url_to_idx is not among the app.utils_db
    # names imported in the visible import lines — verify it is in scope.
    idx = delete_url_from_url_to_idx(path, contributor)

    #Find out index of url
    # npz_to_idx is a pair of parallel lists: [0] is searched for matrix row
    # positions and [1] for document ids (see the two .index() calls below).
    npz_to_idx, npz_to_idx_path = load_npz_to_idx(contributor, lang, theme)
    print("NPZ_TO_IDX")
    print(npz_to_idx)
    j = npz_to_idx[1].index(idx)
    vid = npz_to_idx[0].index(j)
    print(theme, contributor)
    print(path, vid, pod_name)

    #Remove document row from .npz matrix
    pod_m = load_npz(join(pod_dir, contributor, lang, pod_name+'.npz'))
    print("pod_m",pod_m.shape)
    print("vid",vid)
    # Rebuild the matrix from the rows before and after row `vid`.
    m1 = pod_m[:vid]
    m2 = pod_m[vid+1:]
    print("m1",m1.shape)
    print("m2",m2.shape)
    pod_m = vstack((m1,m2))
    print("pod_m",pod_m.shape)
    save_npz(join(pod_dir, contributor, lang, pod_name+'.npz'),pod_m)

    #Remove document from .npz.idx mapping
    # Drop position j from both parallel lists and persist the mapping.
    new_npz = npz_to_idx[0][:j]+npz_to_idx[0][j+1:]
    new_idx = npz_to_idx[1][:j]+npz_to_idx[1][j+1:]
    npz_to_idx = [new_npz,new_idx]
    joblib.dump(npz_to_idx, npz_to_idx_path)
    print("NPZ_TO_IDX")
    print(npz_to_idx)

    #Remove doc from positional index
    posindex = load_posix(contributor, lang, theme)
    new_posindex = []
    # Rebuild one dict per vocabulary token, keeping every document entry
    # except the one whose key equals str(vid).
    for token in vocab:
        token_id = vocab[token]
        tmp = {}
        for doc_id, posidx in posindex[token_id].items():
            if doc_id != str(vid):
                tmp[doc_id] = posidx
        new_posindex.append(tmp)
    dump_posix(new_posindex, contributor, lang, theme)

    #Delete from database
    db.session.delete(u)
    db.session.commit()

    #If pod is now empty, delete it
    # NOTE(review): "empty" is tested as exactly one remaining matrix row —
    # presumably a placeholder row; confirm against the pod creation code.
    print(pod_m.shape)
    if pod_m.shape[0] == 1:
        return_pod_delete(pod_name)
    return "Deleted document with vector id"+str(vid)


4 changes: 4 additions & 0 deletions app/api/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ class Urls(Base):
title = db.Column(db.String(1000))
snippet = db.Column(db.String(1000))
doctype = db.Column(db.String(1000))
vector = db.Column(db.Integer)
pod = db.Column(db.String(1000))
notes = db.Column(db.String(1000))
img = db.Column(db.String(1000))
Expand All @@ -58,6 +59,7 @@ def __init__(self,
title=None,
snippet=None,
doctype=None,
vector=None,
pod=None,
notes=None,
img=None,
Expand All @@ -67,6 +69,7 @@ def __init__(self,
self.title = title
self.snippet = snippet
self.doctype = doctype
self.vector = vector
self.pod = pod
self.notes = notes
self.img = img
Expand All @@ -84,6 +87,7 @@ def serialize(self):
'title': self.title,
'snippet': self.snippet,
'doctype': self.doctype,
'vector': self.vector,
'pod': self.pod,
'notes': self.notes,
'img': self.img,
Expand Down
91 changes: 84 additions & 7 deletions app/cli/controllers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,26 +5,27 @@
import re
import requests
from shutil import copy2, copytree
from os.path import dirname, realpath, join
from os import remove
from os.path import dirname, realpath, join, exists
from os import getenv
from glob import glob
from datetime import datetime
from pathlib import Path
from random import shuffle
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import numpy as np
import joblib
from flask import Blueprint
import click
from werkzeug.security import generate_password_hash
from scipy.sparse import load_npz
from scipy.sparse import load_npz, save_npz, csr_matrix, vstack
from app.indexer.controllers import run_indexer_url, index_doc_from_cli
from app.indexer.access import request_url
from app.indexer.posix import load_posix
from app.indexer.htmlparser import extract_links
from app.orchard.mk_urls_file import get_reindexable_pod_for_admin
from app.utils_db import create_idx_to_url
from app import db, User, Urls, Pods
from app import db, User, Urls, Pods, VEC_SIZE

pears = Blueprint('pears', __name__)

Expand Down Expand Up @@ -160,7 +161,6 @@ def index(host_url, filepath):
users = User.query.all()
for user in users:
Path(join(pod_dir,user.username)).mkdir(parents=True, exist_ok=True)
create_idx_to_url(user.username)
run_indexer_url(filepath, host_url)


Expand Down Expand Up @@ -227,7 +227,7 @@ def index_wiki(folder, regex, lang, contributor, host_url):
- host_url: the domain of your instance, e.g. https://mypears.org.
'''
corpus_files = glob(join(folder, regex ,'*.doc.txt'))
corpus_files = glob(join(folder, f'*{regex}*', '*.doc.txt'))
for filepath in corpus_files:
print(f">>Processing {filepath}...")
with open(filepath, encoding='utf-8') as fin:
Expand All @@ -246,7 +246,8 @@ def index_wiki(folder, regex, lang, contributor, host_url):
elif "</doc" in l:
print(url,theme,title,doc[:30])
note = ""
index_doc_from_cli(title, doc, theme, lang, contributor, url, note, host_url)
if not title.startswith("Talk:"):
index_doc_from_cli(title, doc, theme, lang, contributor, url, note, host_url)
doc = ""
else:
doc+=l+' '
Expand Down Expand Up @@ -401,3 +402,79 @@ def check_pos_vs_npz_to_idx(pod, username, language):
print("\t\t> idx :", set(idx1))
print("\t\t> posix:", set(idx2))
return set(idx1), set(idx2)

#####################
# REBUILD FROM DB
#####################

def _drop_pod(pod, reason):
    """Report why `pod` cannot be rebuilt and delete it from the database."""
    print(f">> ERROR: CLI: REBUILD FROM DB: {reason}")
    print(f">> ERROR: CLI: REBUILD FROM DB: deleting pod {pod.name} from the database.")
    db.session.delete(pod)
    db.session.commit()


def _first_occurrence_map(keys, values):
    """Map each key to the value at its first position, like keys.index() would."""
    lookup = {}
    for key, value in zip(keys, values):
        lookup.setdefault(key, value)
    return lookup


@pears.cli.command('rebuildfromdb')
def rebuild_from_db():
    """Rebuild every pod's .npz matrix from the database and the old index files.

    For each pod in the database, the rows of the existing .npz matrix are
    re-stacked in database order (after an all-zero first row) and each
    url's position in the new matrix is stored in Urls.vector. The
    .npz.idx, .pos and per-user .idx files are removed afterwards.

    Pods whose index or matrix files cannot be loaded are deleted from the
    database, as are urls for which no matrix row can be found.
    """
    # Path of each <user>.idx file -> {url: idx}, loaded once per user.
    user_indexes = {}
    # Every <user>.idx path encountered, for the final clean-up.
    seen_idx_paths = []

    for p in Pods.query.all():
        print(f"\n\n POD {p.name}")
        try:
            urls = db.session.query(Urls).filter_by(pod=p.name).all()
            username = p.name.split('.u.')[1]
            idx_path = join(pod_dir, username, username+'.idx')
            if idx_path not in seen_idx_paths:
                seen_idx_paths.append(idx_path)
            if idx_path not in user_indexes:
                idx_to_url = joblib.load(idx_path)
                user_indexes[idx_path] = _first_occurrence_map(idx_to_url[1], idx_to_url[0])
            url_to_idx = user_indexes[idx_path]
        except Exception as err:
            _drop_pod(p, f"idx for {p.name} is missing or corrupted ({err}).")
            continue

        try:
            npz_idx_path = join(pod_dir, username, p.language, p.name+'.npz.idx')
            npz_to_idx = joblib.load(npz_idx_path)
            idx_to_row = _first_occurrence_map(npz_to_idx[1], npz_to_idx[0])
        except Exception as err:
            _drop_pod(p, f"npz.idx for {p.name} does not exist or is corrupted ({err}).")
            continue

        try:
            npz_path = join(pod_dir, username, p.language, p.name+'.npz')
            # Keep the matrix sparse; CSR supports the row indexing used below.
            pod_m = load_npz(npz_path).tocsr()
        except Exception as err:
            _drop_pod(p, f"npz for {p.name} does not exist or is corrupted ({err}).")
            continue

        # Row 0 of the rebuilt matrix is all zeros, so the first url gets vector id 1.
        rows = [csr_matrix((1, VEC_SIZE))]
        for u in urls:
            try:
                vec = pod_m[idx_to_row[url_to_idx[u.url]]]
            except (KeyError, IndexError, TypeError, ValueError):
                print(f">> ERROR: CLI: REBUILD FROM DB: matrix row not found for url {u.url}.")
                print(f">> ERROR: CLI: REBUILD FROM DB: deleting url {u.url} from the database.")
                db.session.delete(u)
                continue
            rows.append(vec)
            u.vector = len(rows) - 1
        # One commit per pod instead of one per url.
        db.session.commit()

        # Clean up: save new npz (stacked once, not once per url) and remove unused files
        save_npz(npz_path, vstack(rows, format='csr'))
        remove(npz_idx_path)

        pos_path = join(pod_dir, username, p.language, p.name+'.pos')
        if exists(pos_path):
            remove(pos_path)

    for idx_path in seen_idx_paths:
        # A path is recorded even when its file could not be loaded, so it
        # may not exist on disk.
        if exists(idx_path):
            remove(idx_path)
Loading

0 comments on commit dd4527a

Please sign in to comment.