Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add-image-hash-to-detect-rename-and-optimize-reindexing #143

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
24 changes: 20 additions & 4 deletions rclip/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ class NewImage(ImageOmittable):
modified_at: float
size: int
vector: bytes
hash: str


class Image(NewImage):
Expand All @@ -27,11 +28,20 @@ def __init__(self, filename: Union[str, pathlib.Path]):
self._con.row_factory = sqlite3.Row
self.ensure_tables()
self.ensure_version()
self._migrate_db()

def close(self):
  """Commit any pending transaction, then close the underlying connection."""
  connection = self._con
  # Commit first so work done with commit=False is not discarded on close.
  connection.commit()
  connection.close()

def _migrate_db(self):
  """Add the ``hash`` column to an ``images`` table that predates it.

  The table schema is inspected first and the column is only added when it is
  missing, instead of running ALTER TABLE unconditionally and swallowing every
  ``sqlite3.OperationalError``. That way genuine failures (missing table,
  locked or read-only database) are reported rather than silently ignored.

  The column is added as a nullable ``TEXT``; rows that existed before the
  migration therefore keep a NULL hash until they are written again.

  Raises:
    sqlite3.OperationalError: if the column is missing and cannot be added.
  """
  # PRAGMA table_info yields one row per column; index 1 is the column name.
  # Positional access works for both plain tuples and sqlite3.Row.
  existing_columns = {row[1] for row in self._con.execute('PRAGMA table_info(images)')}
  if 'hash' in existing_columns:
    return

  self._con.execute('ALTER TABLE images ADD COLUMN hash TEXT')
  self._con.commit()

def ensure_tables(self):
self._con.execute('''
CREATE TABLE IF NOT EXISTS images (
Expand All @@ -40,6 +50,7 @@ def ensure_tables(self):
filepath TEXT NOT NULL UNIQUE,
modified_at DATETIME NOT NULL,
size INTEGER NOT NULL,
hash TEXT NOT NULL,
vector BLOB NOT NULL
)
''')
Expand Down Expand Up @@ -74,14 +85,14 @@ def commit(self):

def upsert_image(self, image: NewImage, commit: bool = True):
  """Insert ``image``, or overwrite the row that already has its filepath.

  The statement binds the named parameters ``filepath``, ``modified_at``,
  ``size``, ``hash`` and ``vector`` from ``image``. ``deleted`` and
  ``indexing`` default to NULL and are reset on every upsert unless ``image``
  itself supplies them.

  Args:
    image: mapping providing the column values listed above.
    commit: when True, commit right after the statement; pass False to batch
      several writes into one transaction.
  """
  # Exactly one INSERT / VALUES / SET list, each including the hash column, so
  # the stored hash stays in sync with the vector on both insert and update.
  self._con.execute('''
    INSERT INTO images(deleted, indexing, filepath, modified_at, size, hash, vector)
    VALUES (:deleted, :indexing, :filepath, :modified_at, :size, :hash, :vector)
    ON CONFLICT(filepath) DO UPDATE SET
      deleted=:deleted, indexing=:indexing, modified_at=:modified_at, size=:size, hash=:hash, vector=:vector
  ''', {'deleted': None, 'indexing': None, **image})
  if commit:
    self._con.commit()

def remove_indexing_flag_from_all_images(self, commit: bool = True):
self._con.execute('UPDATE images SET indexing = NULL')
if commit:
Expand Down Expand Up @@ -113,3 +124,8 @@ def get_image_vectors_by_dir_path(self, path: str) -> sqlite3.Cursor:
return self._con.execute(
f'SELECT filepath, vector FROM images WHERE filepath LIKE ? AND deleted IS NULL', (path + f'{os.path.sep}%',)
)

def get_image_by_hash(self, hash: str) -> Optional[Image]:
  """Return one stored image whose ``hash`` column equals ``hash``, or None.

  The query does not filter on ``deleted`` and has no ORDER BY, so when several
  rows share a hash the first row SQLite yields is the one returned.
  """
  cursor = self._con.execute('SELECT * FROM images WHERE hash = ?', (hash,))
  match = cursor.fetchone()
  if not match:
    return None
  return Image(**match)
113 changes: 65 additions & 48 deletions rclip/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tqdm import tqdm
import PIL
from PIL import Image, ImageFile
import imagehash

from rclip import db, fs, model
from rclip.utils.preview import preview
Expand All @@ -22,14 +23,19 @@
class ImageMeta(TypedDict):
  """Per-file metadata collected for an image found on disk."""

  # Populated from stat().st_mtime.
  modified_at: float
  # Populated from stat().st_size.
  size: int
  # String form of the image hash computed for the file.
  hash: str


# Triple of (str, ImageMeta, feature vector); the str is presumably the image
# file path, going by the alias name -- confirm against its users.
PathMetaVector = Tuple[str, ImageMeta, model.FeatureVector]


def compute_image_hash(image_path: str) -> str:
  """Open the image at ``image_path`` and return its average hash as a string.

  The file is opened through PIL and closed again as soon as the hash has been
  computed. Any error raised while opening or decoding the image propagates.
  """
  with Image.open(image_path) as opened_image:
    digest = imagehash.average_hash(opened_image)
  return str(digest)

def get_image_meta(entry: os.DirEntry) -> ImageMeta:
  """Build the ImageMeta (mtime, size, hash) for a directory entry.

  Args:
    entry: directory entry of the image file; its ``stat()`` supplies the
      modification time and size, and its ``path`` is hashed.

  Returns:
    ImageMeta with ``modified_at``, ``size`` and ``hash`` all populated.

  Raises:
    Whatever ``entry.stat()`` or ``compute_image_hash`` raise, e.g. ``OSError``
    for an unreadable file.
  """
  stat = entry.stat()
  # Single return that includes the hash; a hash-less result would break every
  # later meta['hash'] lookup.
  return ImageMeta(
    modified_at=stat.st_mtime,
    size=stat.st_size,
    hash=compute_image_hash(entry.path),
  )


def is_image_meta_equal(image: db.Image, meta: ImageMeta) -> bool:
Expand Down Expand Up @@ -85,63 +91,74 @@ def _index_files(self, filepaths: List[str], metas: List[ImageMeta]):
filepath=path,
modified_at=meta['modified_at'],
size=meta['size'],
vector=vector.tobytes()
vector=vector.tobytes(),
hash=meta['hash'],
), commit=False)

def ensure_index(self, directory: str):
print(
'checking images in the current directory for changes;'
' use "--no-indexing" to skip this if no images were added, changed, or removed',
file=sys.stderr,
'checking images in the current directory for changes;'
' use "--no-indexing" to skip this if no images were added, changed, or removed',
file=sys.stderr,
)

self._db.remove_indexing_flag_from_all_images(commit=False)
self._db.flag_images_in_a_dir_as_indexing(directory, commit=True)

with tqdm(total=None, unit='images') as pbar:
def update_total_images(count: int):
pbar.total = count
pbar.refresh()
counter_thread = threading.Thread(
target=fs.count_files,
args=(directory, self._exclude_dir_regex, self.IMAGE_REGEX, update_total_images),
)
counter_thread.start()

images_processed = 0
batch: List[str] = []
metas: List[ImageMeta] = []
for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX):
filepath = entry.path
image = self._db.get_image(filepath=filepath)
try:
meta = get_image_meta(entry)
except Exception as ex:
print(f'error getting fs metadata for {filepath}:', ex, file=sys.stderr)
continue

if not images_processed % self.DB_IMAGES_BEFORE_COMMIT:
self._db.commit()
images_processed += 1
pbar.update()

if image and is_image_meta_equal(image, meta):
self._db.remove_indexing_flag(filepath, commit=False)
continue

batch.append(filepath)
metas.append(meta)

if len(batch) >= self._indexing_batch_size:
self._index_files(batch, metas)
batch = []
metas = []

if len(batch) != 0:
self._index_files(batch, metas)

self._db.commit()
counter_thread.join()
def update_total_images(count: int):
pbar.total = count
pbar.refresh()
counter_thread = threading.Thread(
target=fs.count_files,
args=(directory, self._exclude_dir_regex, self.IMAGE_REGEX, update_total_images),
)
counter_thread.start()

images_processed = 0
batch: List[str] = []
metas: List[ImageMeta] = []
for entry in fs.walk(directory, self._exclude_dir_regex, self.IMAGE_REGEX):
filepath = entry.path
try:
meta = get_image_meta(entry)
except Exception as ex:
print(f'error getting fs metadata for {filepath}:', ex, file=sys.stderr)
continue

if not images_processed % self.DB_IMAGES_BEFORE_COMMIT:
self._db.commit()
images_processed += 1
pbar.update()

existing_image = self._db.get_image_by_hash(meta['hash'])

if existing_image:
if existing_image['filepath'] != filepath:
# Image was renamed, update the filepath
self._db.upsert_image(db.NewImage(
filepath=filepath,
modified_at=meta['modified_at'],
size=meta['size'],
hash=meta['hash'],
vector=existing_image['vector']
), commit=False)
self._db.remove_indexing_flag(filepath, commit=False)
continue

batch.append(filepath)
metas.append(meta)

if len(batch) >= self._indexing_batch_size:
self._index_files(batch, metas)
batch = []
metas = []

if len(batch) != 0:
self._index_files(batch, metas)

self._db.commit()
counter_thread.join()

self._db.flag_indexing_images_in_a_dir_as_deleted(directory)
print('', file=sys.stderr)
Expand Down