From 96ca4d15d1515aac2ecf1b2ff36cea0dc8fd8057 Mon Sep 17 00:00:00 2001 From: Aploium Date: Sun, 15 May 2016 11:17:38 +0800 Subject: [PATCH] Automatic Domains Whitelist (Experimental) --- EasyWebsiteMirror.py | 44 +++++++++++++++++++++++++++++++++++- config.sample.py | 30 ++++++++++++++++++------ tests/regex_rewriter_test.py | 6 ++++- 3 files changed, 71 insertions(+), 9 deletions(-) diff --git a/EasyWebsiteMirror.py b/EasyWebsiteMirror.py index b89a008..031c919 100644 --- a/EasyWebsiteMirror.py +++ b/EasyWebsiteMirror.py @@ -11,6 +11,7 @@ import zlib import gzip from time import time +from fnmatch import fnmatch from html import escape as html_escape import threading from urllib.parse import urljoin, urlsplit, urlunsplit, quote_plus @@ -44,7 +45,7 @@ errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.') local_cache_enable = False -__VERSION__ = '0.18.6-dev' +__VERSION__ = '0.19.0-dev' __author__ = 'Aploium ' # ########## Basic Init ############# @@ -91,6 +92,9 @@ if not isinstance(target_static_domains, set): target_static_domains = set() +if not enable_automatic_domains_whitelist: + domains_whitelist_auto_add_glob_list = tuple() + if not enable_individual_sites_isolation: isolated_domains = set() else: @@ -210,6 +214,40 @@ # # ########## Begin Utils ############# +@lru_cache(maxsize=8192) +def is_domain_match_glob_whitelist(domain): + for domain_glob in domains_whitelist_auto_add_glob_list: + if fnmatch(domain, domain_glob): + return True + return False + + +def try_match_and_add_domain_to_rewrite_white_list(domain): + if domain is None or not domain: + return False + if domain in external_domains_set or domain == target_domain: + return True + if not is_domain_match_glob_whitelist(domain): + return False + else: + infoprint('A domain:', domain, 'was added to whitelist') + + global external_domains, external_domains_set, allowed_domains_set + _buff = list(external_domains) + _buff.append(domain) + external_domains = 
tuple(_buff) + external_domains_set.add(domain) + allowed_domains_set.add(domain) + + # write log + try: + with open('automatic_domains_whitelist.log', 'a', encoding='utf-8') as fp: + fp.write(domain + '\n') + except: + traceback.print_exc() + + return True + def current_line_number(): """Returns the current line number in our program.""" @@ -580,6 +618,10 @@ def regex_url_reassemble(match_obj): # dbgprint('returned_un_touch', whole_match_string) return whole_match_string + # v0.19.0+ Automatic Domains Whitelist (Experimental) + if enable_automatic_domains_whitelist: + try_match_and_add_domain_to_rewrite_white_list(match_domain) + remote_domain, _is_remote_https, remote_path = extract_real_domain_from_url_may_have_extdomains() # dbgprint('remote_path:', remote_path, 'remote_domain:', remote_domain, 'match_domain', match_domain, v=5) # dbgprint(match_obj.groups(), v=5) diff --git a/config.sample.py b/config.sample.py index f8508a8..f286063 100644 --- a/config.sample.py +++ b/config.sample.py @@ -73,6 +73,10 @@ https='https://127.0.0.1:8123', ) +# ############## Output Settings ############## +# Verbose level (0~3) 0:important and error 1:info 2:warning 3:debug. Default is 3 (for first time runner) +verbose_level = 3 + # ############## Misc Settings ############## # v0.18.4+ for some modern websites (google/wiki, etc), we can assume it well always use utf-8 encoding. 
# or for some old-styled sites, we could also force the program to use gbk encoding (just for example) @@ -84,18 +88,30 @@ # 设置为 None 表示关闭显式编码指定, 'utf-8' 代表utf-8 force_decode_remote_using_encode = None -# v0.18.5+ -# eg: {'access-control-max-age', 'access-control-allow-origin', 'x-connection-hash'} -# must be lower case -custom_allowed_remote_headers = {} +# v0.19.0+ Automatic Domains Whitelist (Experimental) +# Given wildcard domain patterns (glob syntax, e.g. '*.example.com'), any domain we encounter that matches one of them +# will be automatically added to `external_domains`. +# However, before you restart your server, you should check the 'automatic_domains_whitelist.log' file +# and manually add its domains to the config; otherwise they will not work after you restart your server. +# You CANNOT rely on the automatic whitelist alone, because the basic (but important) rewrite requires explicitly specified domains to work. +# For more supported patterns, please see: https://docs.python.org/3/library/fnmatch.html#module-fnmatch +# 如果给定以通配符形式的域名, 当程序遇到匹配的域名时, 将会自动加入到 `external_domains` 的列表中 +# 但是, 当你重启服务器程序前, 请检查程序目录下 'automatic_domains_whitelist.log' 文件, +# 并将里面的域名手动添加到 `external_domains` 的列表中 (因为程序不会在运行时修改本配置文件) +# 自动域名添加白名单功能并不能取代 `external_domains` 中一个个指定的域名, +# 因为基础重写(很重要)不支持使用通配符(否则会带来10倍以上的性能下降). +# 如果需要使用 * 以外的通配符, 请查看 https://docs.python.org/3/library/fnmatch.html#module-fnmatch 这里的说明 +enable_automatic_domains_whitelist = True +domains_whitelist_auto_add_glob_list = ('*.google.com', '*.gstatic.com', '*.google.com.hk') # ##################################################### # ################# ADVANCED Settings ################# # ##################################################### -# ############## Output Settings ############## -# Verbose level (0~3) 0:important and error 1:info 2:warning 3:debug. 
Default is 3 (for first time runner) -verbose_level = 3 +# v0.18.5+ +# eg: {'access-control-max-age', 'access-control-allow-origin', 'x-connection-hash'} +# must be lower case +custom_allowed_remote_headers = {} # ############## Cache Settings ############## # Cache remote static files to your local storge. And access them directly from local storge if necessary. diff --git a/tests/regex_rewriter_test.py b/tests/regex_rewriter_test.py index 9681526..5b2cbdc 100644 --- a/tests/regex_rewriter_test.py +++ b/tests/regex_rewriter_test.py @@ -237,7 +237,11 @@ new Image().src = "/url?sa=T&url=" + esc_link + "&oi=" + e(oi)+ "&ct=" + e(ct);return false;}
&nb href="https://g.zju.tools:20822/extdomains/https-zh.wikipedia.org/zh-cn/%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91">https://g.zju.tools:20822/extdomains/https-zh.wikipedia.org/zh-cn/%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91
 href="#" onclick="return go_back();" onmousedown="ctu('unauthorizedredirect','originlink');>
""", ), - + ( + r"""""", + r"""""", + r"""""", + ) ) ColorfulPyPrint_set_verbose_level(5)