Skip to content

Commit

Permalink
take out celery beat; refactor spam.py;added directory locking when i…
Browse files Browse the repository at this point in the history
…ndexing
  • Loading branch information
Natay committed Mar 16, 2021
1 parent 69a9c54 commit 4abdf06
Show file tree
Hide file tree
Showing 22 changed files with 217 additions and 139 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ reset: echo
# Delete the database, logs and CACHE files.
# Keep media and spooler.
rm -rf export/logs/*.log
#rm -rf export/spammers/
# Database is always found in export/db/
rm -f export/db/${DATABASE_NAME}
rm -rf export/static/CACHE
Expand Down
2 changes: 0 additions & 2 deletions biostar/celeryconf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@

app.conf.broker_url = 'redis://127.0.0.1:6379'

app.conf.beat_schedule = settings.BEAT_TASKS

# Discover tasks in applications.
app.autodiscover_tasks(
lambda: settings.TASK_MODULES
Expand Down
4 changes: 2 additions & 2 deletions biostar/forum/auth.py
Original file line number Diff line number Diff line change
Expand Up @@ -496,8 +496,8 @@ def toggle_spam(request, post, **kwargs):
text = f'Restored {post_link(post)} from spam'
else:
text = f'Marked {post_link(post)} as spam'
# Set indexed flag to True, so it's skipped in the next round.
Post.objects.filter(id=post.id).update(indexed=True)
# Set indexed flag to False, so it's removed from spam index
Post.objects.filter(id=post.id).update(indexed=False)

# Set a logging message.
messages.success(request, mark_safe(text))
Expand Down
52 changes: 0 additions & 52 deletions biostar/forum/celeryconf.py

This file was deleted.

67 changes: 40 additions & 27 deletions biostar/forum/management/commands/index.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@

import logging
from typing import Any

import os, sys
from django.core.management.base import BaseCommand
from biostar.forum.models import Post
from django.conf import settings
from biostar.forum import search, spam
from biostar.utils.decorators import check_lock

# Logger used by the indexing helpers in this module.
logger = logging.getLogger('engine')

# Path of the 'flag' entry under settings.INDEX_DIR; passed to check_lock to guard build().
LOCK = os.path.join(settings.INDEX_DIR, 'flag')


def handle_spam(posts):
"""
Expand All @@ -23,6 +26,41 @@ def handle_spam(posts):
spam.add_spam(post=post)


@check_lock(LOCK)
def build(size, remove=False):
    """
    Index a batch of unindexed posts, then process a batch of spam posts.

    At most `size` posts are handled in each of the two steps. The `remove`
    flag is forwarded to search.index_posts as its `overwrite` argument.
    The whole function runs under the check_lock(LOCK) decorator.
    """
    # Step 1: valid posts that have a root and are not flagged as indexed yet.
    pending = Post.objects.valid_posts(indexed=False).exclude(root=None)[:size]
    batch_size = len(pending)

    # Remember the primary keys so the same rows can be flagged afterwards.
    pending_ids = [item.id for item in pending]

    # Push the batch into the search index.
    search.index_posts(posts=pending, overwrite=remove)

    # Flag the batch as indexed.
    Post.objects.filter(id__in=pending_ids).update(indexed=True)

    # Count what is still waiting to be indexed.
    remaining = Post.objects.valid_posts(indexed=False).exclude(root=None).count()

    logger.info(f"Indexed {batch_size} posts, {remaining} unindexed posts remaining")

    # Step 2: spam posts whose indexed flag is still False.
    flagged = Post.objects.filter(spam=Post.SPAM, indexed=False)[:size]
    flagged_ids = [item.id for item in flagged]

    # Hand the spam batch over to handle_spam.
    handle_spam(posts=flagged)

    # Mark the spam batch as processed so it is skipped next time.
    Post.objects.filter(id__in=flagged_ids).update(indexed=True)

    logger.info(f"Removed {len(flagged_ids)} spam posts from index")


class Command(BaseCommand):
help = 'Create search index for the forum app.'

Expand All @@ -32,7 +70,6 @@ def add_arguments(self, parser):
parser.add_argument('--remove', action='store_true', default=False, help="Removes the existing index.")
parser.add_argument('--report', action='store_true', default=False, help="Reports on the content of the index.")
parser.add_argument('--size', type=int, default=0, help="How many posts to index")
#parser.add_argument('--clear_spam', action='store_true', default=False, help="Clear search index of spam posts.")

def handle(self, *args, **options):

Expand All @@ -50,31 +87,7 @@ def handle(self, *args, **options):

# Index a limited number yet unindexed posts
if size:

posts = Post.objects.valid_posts(indexed=False).exclude(root=None)[:size]
target_count = len(posts)

# The list of posts to update
ids = [ post.id for post in posts ]

# Add post to search index.
search.index_posts(posts=posts, overwrite=remove)

# Set the indexed field to true.
Post.objects.filter(id__in=ids).update(indexed=True)

count = Post.objects.valid_posts(indexed=False).exclude(root=None).count()

logger.info(f"Indexed {target_count} posts, {count} unindexed posts remaining")

# Take spam posts that have been indexed and remove.
spam_posts = Post.objects.filter(spam=Post.SPAM, indexed=True)[:size]

# Remove spam post from index, if present.
handle_spam(posts=spam_posts)

# Add to spam index
logger.info(f"Removed {spam.count()} spam posts, from index")
build(size=size, remove=remove)

# Report the contents of the index
if report:
Expand Down
4 changes: 2 additions & 2 deletions biostar/forum/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
# Inherit from the accounts settings file.
from biostar.planet.settings import *

from .celeryconf import *

def join(*args):
    """Join the given path components and return the result as an absolute path."""
    combined = os.path.join(*args)
    return os.path.abspath(combined)

Expand Down Expand Up @@ -160,6 +158,8 @@ def join(*args):
}
}

TASK_MODULES = ("biostar.forum.tasks", )

# Tries to load up secret settings from a predetermined module
# This is for convenience only!
try:
Expand Down
40 changes: 19 additions & 21 deletions biostar/forum/spam.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import shutil
import random
import time
from collections import defaultdict
from math import log, exp
from itertools import groupby, islice, count, chain
from django.conf import settings
Expand Down Expand Up @@ -131,35 +132,33 @@ def search_spam(post, ix):
Returns
"""

# Add this post to index to perform search.
writer = BufferedWriter(ix)
# Add post to index to more_like_this perform search.
writer = AsyncWriter(ix)
add_post_to_index(post=post, writer=writer, is_spam=post.is_spam)
writer.commit()
writer.close()

searcher = ix.searcher()
docnum = searcher.document_number(uid=post.uid)
#writer.commit()

# Search for this post in the spam index
# fields = ['uid']
#fields = ['uid']
# More like this
#results = search.preform_whoosh_search(ix=ix, query=post.uid, fields=fields)
print("P")
#time.sleep(1)
fields = ['uid']
results = search.preform_whoosh_search(ix=ix, query=post.uid, fields=fields)

# Preform more_like_this on this posts content
similar_content = searcher.more_like(docnum, 'content')
print("L")
#print(similar_content)
similar_content = results[0].more_like_this('content', top=5)

# Remove post after finding similar content
#similar_content = []
# # Remove this post from the spam index after results are collected.
#writer.delete_document(docnum)
#writer.commit()
# writer.delete_document(docnum)
# writer.commit()
writer = AsyncWriter(ix)
writer.delete_by_term('uid', text=post.uid)
writer.commit()

# Get the results into a list and close the searcher object.
similar_content = list(map(search.normalize_result, similar_content))
#print(similar_content)
searcher.close()
#print(similar_content, results, post.uid)
# searcher.close()
results.searcher.close()

return similar_content

Expand Down Expand Up @@ -208,7 +207,6 @@ def compute_score(post, ix=None):
def score(post, threshold=None):
"""
"""

if not settings.CLASSIFY_SPAM:
return

Expand All @@ -230,4 +228,4 @@ def score(post, threshold=None):
Post.objects.filter(id=post.id).update(spam=Post.SPAM)
auth.db_logger(text=f"auto marked spam :{auth.post_link(post)} spam score={spam_score}")

return spam_score
return spam_score
6 changes: 0 additions & 6 deletions biostar/recipes/celeryconf.py

This file was deleted.

2 changes: 2 additions & 0 deletions biostar/recipes/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
# Valid options; block, disable, threaded, uwsgi, celery.
TASK_RUNNER = 'threaded'

TASK_MODULES = ("biostar.recipes.tasks",)

PAGEDOWN_IMAGE_UPLOAD_ENABLED = True

# Amount of objects shown per page.
Expand Down
37 changes: 36 additions & 1 deletion biostar/utils/decorators.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import logging, functools, time
import logging, functools, time, os
from functools import partial
from django.conf import settings
from django.shortcuts import redirect
Expand All @@ -20,6 +20,40 @@ def inner(request, **kwargs):
return inner


def check_lock(lock):
    """
    Build a decorator that runs the wrapped function while holding a
    directory-based lock.

    The lock is the directory at path `lock`. It is created right before the
    wrapped function runs and removed once the function has finished, whether
    it returned or raised.

    If the directory already exists, another run is assumed to be in progress:
    an error is logged and SystemExit is raised.

    Exceptions derived from Exception that escape the wrapped function are
    logged and swallowed, in which case the wrapper returns None. Anything
    else (e.g. KeyboardInterrupt) propagates after the lock is released.
    """

    def __inner(func):

        @functools.wraps(func)
        def __wrapper(*args, **kwargs):

            # Creating the directory is both the check and the acquisition, so
            # two concurrent runs cannot both pass a separate isdir() test.
            try:
                os.makedirs(lock)
            except FileExistsError:
                logger.error('Lock directory detected, task is already running')
                # Equivalent to sys.exit(), without depending on a `sys` import.
                raise SystemExit() from None

            try:
                # Run the protected function and hand back its output.
                return func(*args, **kwargs)
            except Exception as exc:
                # Best effort: record the failure (with traceback) and carry on.
                logger.exception(exc)
                return None
            finally:
                # Always release the lock, even on SystemExit/KeyboardInterrupt,
                # so a failed run does not block every later run.
                os.rmdir(lock)

        return __wrapper

    return __inner


def d_timer():
"""
Return disabled timer.
Expand Down Expand Up @@ -201,6 +235,7 @@ def b_worker():
def outer(func, *args, **kwargs):
@functools.wraps(func)
def inner(*args, **kwargs):
#logger.info(f"running f{func} {args} {kwargs}")
return func(*args, **kwargs)

inner.spool = inner
Expand Down
25 changes: 25 additions & 0 deletions conf/scripts/forum/backup-daily.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

# Daily database backup script: runs the pg_dump task and writes into export/backup.

# Stop on errors (-e) and on use of unset variables (-u).
set -ue

# Load the conda commands.
source ~/miniconda3/etc/profile.d/conda.sh

# Exported for the commands below; presumably the PostgreSQL socket directory (confirm).
export POSTGRES_HOST=/var/run/postgresql

# Activate the conda environment.
conda activate engine

# User name handed to the backup task through --user.
USER=www

# Set the configuration module.
export DJANGO_SETTINGS_MODULE=conf.run.site_settings

# Make sure the backup location exists.
mkdir -p export/backup

# pg_dump the database into the backup location.
python manage.py tasks --action pg_dump --outdir export/backup --user ${USER}
25 changes: 25 additions & 0 deletions conf/scripts/forum/backup-hourly.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/bin/bash

# Hourly database backup script: runs the pg_dump task with --hourly and writes into export/backup.

# Stop on errors (-e) and on use of unset variables (-u).
set -ue

# Load the conda commands.
source ~/miniconda3/etc/profile.d/conda.sh

# Exported for the commands below; presumably the PostgreSQL socket directory (confirm).
export POSTGRES_HOST=/var/run/postgresql

# Activate the conda environment.
conda activate engine

# User name handed to the backup task through --user.
USER=www

# Set the configuration module.
export DJANGO_SETTINGS_MODULE=conf.run.site_settings

# Make sure the backup location exists.
mkdir -p export/backup

# pg_dump the database into the backup location, using the task's --hourly mode.
python manage.py tasks --action pg_dump --outdir export/backup --user ${USER} --hourly
File renamed without changes.
Loading

0 comments on commit 4abdf06

Please sign in to comment.