From 6090f9f672c9af24f0af0f928144fda206fa43fc Mon Sep 17 00:00:00 2001 From: piotrj Date: Fri, 1 Dec 2023 22:14:42 +0100 Subject: [PATCH] parallel serching in records (draft),other searching improvements --- src/core.py | 374 ++++++++++++++++++++++++++++++++++++++++++++------ src/librer.py | 75 ++++++++-- 2 files changed, 392 insertions(+), 57 deletions(-) diff --git a/src/core.py b/src/core.py index 8823bcb..fc7a60c 100644 --- a/src/core.py +++ b/src/core.py @@ -26,12 +26,17 @@ # #################################################################################### +from time import sleep +#from threading import Thread +from multiprocessing import Process, Manager + from zstandard import ZstdCompressor,ZstdDecompressor from zipfile import ZipFile from os import scandir,stat,sep from os import remove as os_remove +from os import cpu_count from os.path import abspath,normpath,basename from os.path import join as path_join @@ -444,7 +449,7 @@ def extract_customdata(self): returncode,output = result_tuple new_elem={} - new_elem['cd_ok']= True if returncode==0 else False + new_elem['cd_ok']= bool(returncode==0) if output not in customdata_helper: customdata_helper[output]=cd_index @@ -514,8 +519,6 @@ def extract_customdata(self): self.exe = None - search_kind_code_tab={'dont':0,'without':1,'any':2,'error':3,'regexp':4,'glob':5,'fuzzy':6} - ############################################################# def tupelize_rec(self,scan_like_data): entry_LUT_encode_loc = entry_LUT_encode @@ -656,57 +659,60 @@ def clone_record(self,file_path,keep_cd=True,keep_crc=True,compression_level=16) new_record.filestructure = self.clone_record_rec(self.customdata,self.filenames,self.filestructure,keep_cd,keep_crc) new_record.save(file_path,compression_level) + ######################################################################################## def find_items(self, + record_nr,managed_progress,managed_results,managed_results_len,managed_abort, size_min,size_max, filename_search_kind,name_func_to_call,cd_search_kind,cd_func_to_call): - self.decompress_filestructure() + self.find_results = [] + managed_results[record_nr] = [] - dont_kind_code = self.search_kind_code_tab['dont'] - regexp_kind_code = self.search_kind_code_tab['regexp'] - glob_kind_code = self.search_kind_code_tab['glob'] - without_kind_code = self.search_kind_code_tab['without'] - any_kind_code = self.search_kind_code_tab['any'] - error_kind_code = self.search_kind_code_tab['error'] - fuzzy_kind_code = self.search_kind_code_tab['fuzzy'] + self.decompress_filestructure() - find_results = self.find_results = [] - find_results_add = find_results.append + results = set() + results_add = results.add filenames_loc = self.filenames filestructure = self.filestructure self.files_search_progress = 0 - filename_search_kind_code = self.search_kind_code_tab[filename_search_kind] - cd_search_kind_code = self.search_kind_code_tab[cd_search_kind] - - if cd_search_kind_code!=dont_kind_code: + if cd_search_kind!='dont': self.decompress_customdata() entry_LUT_decode_loc = entry_LUT_decode - use_size = True if size_min or size_max else False + use_size = bool(size_min or size_max) search_list = [ (filestructure[4],[]) ] search_list_pop = search_list.pop search_list_append = search_list.append - rgf_group = (regexp_kind_code,glob_kind_code,fuzzy_kind_code) + cd_search_kind_is_regezp_glob_or_fuzzy = bool(cd_search_kind in ('regexp','glob','fuzzy')) + cd_search_kind_is_dont_or_without = bool(cd_search_kind in ('dont','without')) + + when_folder_may_apply = bool(cd_search_kind_is_dont_or_without and not use_size) + cd_search_kind_is_any = bool(cd_search_kind=='any') + cd_search_kind_is_without = bool(cd_search_kind=='without') + cd_search_kind_is_error = bool(cd_search_kind=='error') self_customdata = self.customdata + while search_list: - if self.abort_action: + if managed_abort[record_nr]: break filestructure,parent_path_components = search_list_pop() for data_entry in filestructure: - if self.abort_action: + if managed_abort[record_nr]: break self.files_search_progress +=1 + managed_progress[record_nr]=self.files_search_progress + name_nr,code,size,mtime = data_entry[0:4] name = filenames_loc[name_nr] @@ -724,18 +730,17 @@ def find_items(self, cd_nr = data_entry[elem_index] elem_index+=1 - if has_crc: - crc = data_entry[elem_index] - - cd_search_kind_code_is_rgf = True if cd_search_kind_code in rgf_group else False + #if has_crc: + # crc = data_entry[elem_index] next_level = parent_path_components + [name] if is_dir : - if cd_search_kind_code==dont_kind_code and not use_size: + if when_folder_may_apply: #katalog moze spelniac kryteria naazwy pliku ale nie ma rozmiaru i custom data if name_func_to_call: if name_func_to_call(name): - find_results_add( tuple([tuple(next_level),size,mtime]) ) + results_add( tuple([tuple(next_level),size,mtime]) ) + managed_results_len[record_nr]+=1 if sub_data: search_list_append( (sub_data,next_level) ) @@ -753,18 +758,21 @@ def find_items(self, continue if name_func_to_call: - func_res_code = name_func_to_call(name) - if not func_res_code: + try: + if not name_func_to_call(name): + continue + except Exception as e: + self.log.error('find_items(1):%s',str(e) ) continue #oczywistosc - #if cd_search_kind_code==dont_kind_code: + #if cd_search_kind=='dont': # pass - if cd_search_kind_code==any_kind_code: + if cd_search_kind_is_any: if not has_cd or not cd_ok: continue - elif cd_search_kind_code_is_rgf: + elif cd_search_kind_is_regezp_glob_or_fuzzy: if has_cd and cd_ok: cd_data = self_customdata[cd_nr] else: @@ -772,28 +780,30 @@ def find_items(self, if cd_func_to_call: try: - #cd_txt = cd_data - #self.get_cd_text(cd_data) - if not cd_func_to_call(cd_data): continue except Exception as e: - self.log.error('find_items_rec:%s',str(e) ) + self.log.error('find_items(2):%s',str(e) ) continue else: continue - elif cd_search_kind_code==without_kind_code: + elif cd_search_kind_is_without: if has_cd: continue - elif cd_search_kind_code==error_kind_code: + elif cd_search_kind_is_error: if has_cd: if cd_ok: continue else: continue - find_results_add( tuple([tuple(next_level),size,mtime ]) ) + results_add( tuple([tuple(next_level),size,mtime ]) ) + managed_results_len[record_nr]+=1 + + + managed_results[record_nr] = list(results) + #self.find_results def find_items_sort(self,what,reverse): if what=='data': @@ -975,6 +985,12 @@ def decompress_customdata(self): else: return False +def global_find_items(record,record_nr,managed_progress,managed_results,managed_results_len,managed_abort,size_min,size_max,find_filename_search_kind,name_func_to_call,find_cd_search_kind,cd_func_to_call): + record.find_items(record_nr,managed_progress,managed_results,managed_results_len,managed_abort, + size_min,size_max, + find_filename_search_kind,name_func_to_call, + find_cd_search_kind,cd_func_to_call) + ####################################################################### class LibrerCore: records = set() @@ -1107,7 +1123,7 @@ def find_results_clean(self): for record in self.records: record.find_results_clean() - def find_items_in_all_records(self, + def find_items_in_all_records_old(self, range_par, size_min,size_max, find_filename_search_kind,name_expr,name_case_sens, @@ -1128,7 +1144,7 @@ def find_items_in_all_records(self, else: name_func_to_call = lambda x : re_compile(translate(name_expr), IGNORECASE).match(x) elif find_filename_search_kind == 'fuzzy': - name_func_to_call = lambda x : True if SequenceMatcher(None, name_expr, x).ratio()>filename_fuzzy_threshold_float else False + name_func_to_call = lambda x : bool(SequenceMatcher(None, name_expr, x).ratio()>filename_fuzzy_threshold_float) else: name_func_to_call = None else: @@ -1146,7 +1162,7 @@ def find_items_in_all_records(self, else: cd_func_to_call = lambda x : re_compile(translate(cd_expr), IGNORECASE).match(x) elif find_cd_search_kind == 'fuzzy': - cd_func_to_call = lambda x : True if SequenceMatcher(None, name_expr, x).ratio()>cd_fuzzy_threshold_float else False + cd_func_to_call = lambda x : bool(SequenceMatcher(None, name_expr, x).ratio()>cd_fuzzy_threshold_float) else: cd_func_to_call = None else: @@ -1186,9 +1202,281 @@ def find_items_in_all_records(self, print(e) self.files_search_progress += record.header.quant_files - self.find_res_quant = len(record.find_results) + self.find_res_quant += len(record.find_results) + ############################################################ + + ######################################################################################################################## + def find_items_in_all_records(self, + range_par, + size_min,size_max, + find_filename_search_kind,name_expr,name_case_sens, + find_cd_search_kind,cd_expr,cd_case_sens, + filename_fuzzy_threshold,cd_fuzzy_threshold): + + self.find_results_clean() + + if name_expr: + filename_fuzzy_threshold_float=float(filename_fuzzy_threshold) if find_filename_search_kind == 'fuzzy' else 0 + + if find_filename_search_kind == 'regexp': + name_func_to_call = lambda x : search(name_expr,x) + elif find_filename_search_kind == 'glob': + if name_case_sens: + #name_func_to_call = lambda x : fnmatch(x,name_expr) + name_func_to_call = lambda x : re_compile(translate(name_expr)).match(x) + else: + name_func_to_call = lambda x : re_compile(translate(name_expr), IGNORECASE).match(x) + elif find_filename_search_kind == 'fuzzy': + name_func_to_call = lambda x : bool(SequenceMatcher(None, name_expr, x).ratio()>filename_fuzzy_threshold_float) + else: + name_func_to_call = None + else: + name_func_to_call = None + + if cd_expr: + cd_fuzzy_threshold_float = float(cd_fuzzy_threshold) if find_cd_search_kind == 'fuzzy' else 0 + + if find_cd_search_kind == 'regexp': + cd_func_to_call = lambda x : search(cd_expr,x) + elif find_cd_search_kind == 'glob': + if cd_case_sens: + #cd_func_to_call = lambda x : fnmatch(x,cd_expr) + cd_func_to_call = lambda x : re_compile(translate(cd_expr)).match(x) + else: + cd_func_to_call = lambda x : re_compile(translate(cd_expr), IGNORECASE).match(x) + elif find_cd_search_kind == 'fuzzy': + cd_func_to_call = lambda x : bool(SequenceMatcher(None, name_expr, x).ratio()>cd_fuzzy_threshold_float) + else: + cd_func_to_call = None + else: + cd_func_to_call = None + + self.find_res_quant = 0 + records_to_process = [range_par] if range_par else list(self.records) + + records_to_process.sort(reverse = True,key = lambda x : x.header.quant_files) + #for record in records_to_process: + # print(record.header.label,'\t',record.header.quant_files) + + + self.files_search_progress = 0 + + self.search_record_nr=0 + #self.search_record_ref=None + + #records_len = len(self.records) + + self.abort_action = False + for record in records_to_process: + record.abort_action = False + ############################################################ + ############################################################ + + max_processes = cpu_count() + #max_processes = 8 + + manager = Manager() + + managed_results = manager.dict() + managed_results_len = manager.dict() + managed_progress = manager.dict() + managed_abort = manager.dict() + for record_nr,record in enumerate(records_to_process): + managed_results[record_nr]=[] + managed_results_len[record_nr]=0 + managed_progress[record_nr]=0 + managed_abort[record_nr]=False + + self.info_line = 'Initializing subprocesses ...' + jobs = {} + records_to_process_len = len(records_to_process) + for record_nr,record in enumerate(records_to_process): + subprocess = Process(target=global_find_items, args=(record,record_nr,managed_progress,managed_results,managed_results_len,managed_abort,size_min,size_max, + find_filename_search_kind,name_func_to_call, + find_cd_search_kind,cd_func_to_call)) + + jobs[record_nr] = [False,subprocess] + + self.info_line = 'subprocesses run.' + + ##################################################### + while True: + if self.abort_action: + break + + need_to_run = [ record_nr for record_nr in range(records_to_process_len) if jobs[record_nr][0]==False ] + need_to_run_len = len(need_to_run) + + running = len([record_nr for record_nr in range(records_to_process_len) if jobs[record_nr][0]==True and jobs[record_nr][1].is_alive() ]) + + self.search_record_nr = records_to_process_len-running-need_to_run_len + self.records_perc_info = self.search_record_nr * 100.0 / records_to_process_len + + self.info_line = f'Running threads: {running}' + + #done = records_to_process_len-running-need_to_run_len + #print(f'{need_to_run_len=} {running=}') + + if need_to_run: + if running", lambda event : self.finder_wrapper_show() ) + self.status_find_tooltip = lambda message : self.widget_tooltip(self.status_find,message) + + self.status_find_tooltip_default = 'No search results\nClick to open find dialog.' + self.status_find_tooltip(self.status_find_tooltip_default) + + ############################################################################## tree = self.tree = Treeview(self_main,takefocus=True,show=('tree','headings') ) self.tree_set = tree.set @@ -1489,7 +1499,7 @@ def file_cascade_post(): self_file_cascade_add_separator() self_file_cascade_add_command(label = 'Find ...',command = self.finder_wrapper_show, accelerator="Ctrl+F",image = self.ico_find,compound='left',state = 'normal' if self.sel_item is not None and self.current_record else 'disabled') self_file_cascade_add_separator() - self_file_cascade_add_command(label = 'Clear Find Results',command = self.find_clear, image = self.ico_empty,compound='left') + self_file_cascade_add_command(label = 'Clear Search Results',command = self.find_clear, image = self.ico_empty,compound='left',state = 'normal' if self.any_find_result else 'disabled') self_file_cascade_add_separator() #self_file_cascade_add_command(label = 'Save CSV',command = self.csv_save,state=item_actions_state,image = self_ico['empty'],compound='left') #self_file_cascade_add_separator() @@ -1710,6 +1720,8 @@ def help_cascade_post(): self_main_bind('', lambda event : self.show_customdata()) self_main_bind('', lambda event : self.delete_data_record()) + self_main_bind('', lambda event : self.find_next()) + self_main_bind('', lambda event : self.find_prev()) self_main.mainloop() @@ -1954,17 +1966,48 @@ def finder_wrapper_show(self): def find_close(self): self.find_dialog.hide() + @restore_status_line + @block_actions_processing + @gui_block def find_clear(self): - print('find_clear') + self.status('Cleaning search results ...') + + librer_core.find_results_clean() + + self_tree_get_children = self.tree.get_children + nodes_set = set(self_tree_get_children()) + nodes_set_pop = nodes_set.pop + nodes_set_add = nodes_set.add + self_tree_item = self.tree.item + self_FOUND = self.FOUND + + while nodes_set: + item=nodes_set_pop() + + tags=self_tree_item(item,'tags') + if self_FOUND in tags: + tags = set(tags) + tags.remove(self_FOUND) + self_tree_item(item,tags=tags) + + _ = {nodes_set_add(child) for child in self_tree_get_children(item)} + self.status_find_tooltip(self.status_find_tooltip_default) + self.any_find_result = False + @restore_status_line + @block_actions_processing + @gui_block def find_prev(self): if not self.any_find_result: self.finder_wrapper_show() else: self.select_find_result(-1) + @restore_status_line + @block_actions_processing + @gui_block def find_next(self): if not self.any_find_result: self.finder_wrapper_show() @@ -2245,18 +2288,25 @@ def find_items(self): fnumber_librer_core_files_search_quant = fnumber(librer_core_files_search_quant) fnumber_records_len = fnumber(records_len) + time_without_busy_sign=0 while search_thread_is_alive(): now=time() ###################################################################################### change0 = self_progress_dialog_on_find_update_lab_text(0,librer_core.info_line) if now>last_res_check+1: - change3 = self_progress_dialog_on_find_update_lab_text(3,'Found Files: ' + fnumber(librer_core.find_res_quant + len(librer_core.search_record_ref.find_results)) ) + #change3 = self_progress_dialog_on_find_update_lab_text(3,f'Found Files: {fnumber(librer_core.find_res_quant)} ({fnumber(len(librer_core.search_record_ref.find_results))})' ) + change3 = self_progress_dialog_on_find_update_lab_text(3,f'Found Files: {fnumber(librer_core.find_res_quant)}' ) last_res_check=now else: change3 = False - curr_files = librer_core.files_search_progress + librer_core.search_record_ref.files_search_progress + curr_files = librer_core.files_search_progress + #if librer_core.search_record_ref: + #+ librer_core.search_record_ref.files_search_progress + #else: + # curr_files = 0 + files_perc = curr_files * 100.0 / librer_core_files_search_quant self_progress_dialog_on_find_progr1var_set(librer_core.records_perc_info) @@ -2267,7 +2317,7 @@ def find_items(self): if self.action_abort: librer_core.abort() - librer_core.search_record_ref.abort() + #librer_core.search_record_ref.abort() break if change0 or change3 or prev_curr_files != curr_files: @@ -2307,7 +2357,9 @@ def find_items(self): abort_info = '\nSearching aborted. Resuls may be incomplete.' if self.action_abort else '' - self.info_dialog_on_find.show('Search sesults',f'found: {fnumber(find_results_quant_sum)} items.' + abort_info) + find_results_quant_sum_format = fnumber(find_results_quant_sum) + self.info_dialog_on_find.show('Search results',f'found: {find_results_quant_sum_format} items.' + abort_info) + self.status_find_tooltip(f"available search results: {find_results_quant_sum_format}") if self.action_abort: self.searching_aborted = True @@ -2505,13 +2557,7 @@ def key_press(self,event): ctrl_pressed = 'Control' in event_str shift_pressed = 'Shift' in event_str - if key=='F3': - if shift_pressed: - self.find_prev() - else: - self.find_next() - - elif key=='BackSpace': + if key=='BackSpace': pass elif key in ('c','C'): if ctrl_pressed: @@ -2687,7 +2733,8 @@ def context_menu_show(self,event): pop_add_command(label = 'Find next',command = self.find_next,accelerator="F3",state = 'normal' if self.sel_item is not None else 'disabled', image = self.ico_empty,compound='left') pop_add_command(label = 'Find prev',command = self.find_prev,accelerator="Shift+F3",state = 'normal' if self.sel_item is not None else 'disabled', image = self.ico_empty,compound='left') pop_add_separator() - pop_add_command(label = 'Clear Find Results',command = self.find_clear, image = self.ico_empty,compound='left') + pop_add_command(label = 'Clear Search Results',command = self.find_clear, image = self.ico_empty,compound='left',state = 'normal' if self.any_find_result else 'disabled') + pop_add_separator() pop_add_command(label = 'Exit', command = self.exit ,image = self.ico['exit'],compound='left')