Skip to content
This repository has been archived by the owner on Feb 28, 2019. It is now read-only.

Commit

Permalink
Automatic Domains Whitelist (Experimental)
Browse files Browse the repository at this point in the history
  • Loading branch information
aploium committed May 15, 2016
1 parent 3c33e72 commit 96ca4d1
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 9 deletions.
44 changes: 43 additions & 1 deletion EasyWebsiteMirror.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import zlib
import gzip
from time import time
from fnmatch import fnmatch
from html import escape as html_escape
import threading
from urllib.parse import urljoin, urlsplit, urlunsplit, quote_plus
Expand Down Expand Up @@ -44,7 +45,7 @@
errprint('Can Not Create Local File Cache: ', e, ' local file cache is disabled automatically.')
local_cache_enable = False

__VERSION__ = '0.18.6-dev'
__VERSION__ = '0.19.0-dev'
__author__ = 'Aploium <[email protected]>'

# ########## Basic Init #############
Expand Down Expand Up @@ -91,6 +92,9 @@
if not isinstance(target_static_domains, set):
target_static_domains = set()

if not enable_automatic_domains_whitelist:
domains_whitelist_auto_add_glob_list = tuple()

if not enable_individual_sites_isolation:
isolated_domains = set()
else:
Expand Down Expand Up @@ -210,6 +214,40 @@
#

# ########## Begin Utils #############
@lru_cache(maxsize=8192)
def is_domain_match_glob_whitelist(domain):
    """Tell whether `domain` matches any automatic-whitelist glob pattern.

    The patterns are read from `domains_whitelist_auto_add_glob_list` and
    compared with `fnmatch`. Results are memoized per domain by `lru_cache`.

    :param domain: domain name to test
    :return: True if at least one pattern matches, otherwise False
    """
    return any(
        fnmatch(domain, pattern)
        for pattern in domains_whitelist_auto_add_glob_list
    )


def try_match_and_add_domain_to_rewrite_white_list(domain):
    """Add `domain` to the rewrite whitelist if it matches a configured glob.

    A domain that is already in `external_domains_set`, or that equals
    `target_domain`, is reported as whitelisted without further work.
    Otherwise the domain is checked with `is_domain_match_glob_whitelist`;
    on a match it is appended to `external_domains`, added to
    `external_domains_set` and `allowed_domains_set`, and recorded in
    'automatic_domains_whitelist.log'.

    :param domain: domain name to test; None or an empty value is rejected
    :return: True if the domain is whitelisted (already, or newly added),
        False if it is empty or matches no glob pattern
    """
    # The `global` statement must precede every use of these names in the
    # function body; declaring it later is a SyntaxError on Python 3.6+.
    global external_domains, external_domains_set, allowed_domains_set

    if not domain:
        return False
    if domain in external_domains_set or domain == target_domain:
        return True
    if not is_domain_match_glob_whitelist(domain):
        return False

    infoprint('A domain:', domain, 'was added to whitelist')

    # `external_domains` is rebound to a new tuple; the two sets are
    # updated in place.
    external_domains = tuple(external_domains) + (domain,)
    external_domains_set.add(domain)
    allowed_domains_set.add(domain)

    # Logging is best-effort: a failure to write the log must not undo the
    # whitelisting, so the error is printed and otherwise ignored.
    try:
        with open('automatic_domains_whitelist.log', 'a', encoding='utf-8') as fp:
            fp.write(domain + '\n')
    except Exception:
        traceback.print_exc()

    return True


def current_line_number():
"""Returns the current line number in our program."""
Expand Down Expand Up @@ -580,6 +618,10 @@ def regex_url_reassemble(match_obj):
# dbgprint('returned_un_touch', whole_match_string)
return whole_match_string

# v0.19.0+ Automatic Domains Whitelist (Experimental)
if enable_automatic_domains_whitelist:
try_match_and_add_domain_to_rewrite_white_list(match_domain)

remote_domain, _is_remote_https, remote_path = extract_real_domain_from_url_may_have_extdomains()
# dbgprint('remote_path:', remote_path, 'remote_domain:', remote_domain, 'match_domain', match_domain, v=5)
# dbgprint(match_obj.groups(), v=5)
Expand Down
30 changes: 23 additions & 7 deletions config.sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@
https='https://127.0.0.1:8123',
)

# ############## Output Settings ##############
# Verbose level (0~3) 0:important and error 1:info 2:warning 3:debug. Default is 3 (for first time runner)
verbose_level = 3

# ############## Misc Settings ##############
# v0.18.4+ for some modern websites (google/wiki, etc), we can assume it will always use utf-8 encoding.
# or for some old-styled sites, we could also force the program to use gbk encoding (just for example)
Expand All @@ -84,18 +88,30 @@
# 设置为 None 表示关闭显式编码指定, 'utf-8' 代表utf-8
force_decode_remote_using_encode = None

# v0.18.5+
# eg: {'access-control-max-age', 'access-control-allow-origin', 'x-connection-hash'}
# must be lower case
custom_allowed_remote_headers = {}
# v0.19.0+ Automatic Domains Whitelist (Experimental)
# Given a list of wildcard domain patterns (glob syntax, e.g. '*.example.com'), any domain we encounter
# that matches one of these patterns is automatically added to `external_domains`.
# However, before you restart your server, you should check the 'automatic_domains_whitelist.log' file
# and manually add those domains to the config; otherwise they will not work after you restart your server.
# You CANNOT rely on the automatic whitelist alone, because the basic (but important) rewrite requires explicitly specified domains to work.
# For More Supported Pattern Please See: https://docs.python.org/3/library/fnmatch.html#module-fnmatch
# 如果给定以通配符形式的域名, 当程序遇到匹配的域名时, 将会自动加入到 `external_domains` 的列表中
# 但是, 当你重启服务器程序前, 请检查程序目录下 'automatic_domains_whitelist.log' 文件,
# 并将里面的域名手动添加到 `external_domains` 的列表中 (因为程序不会在运行时修改本配置文件)
# 自动域名添加白名单功能并不能取代 `external_domains` 中一个个指定的域名,
# 因为基础重写(很重要)不支持使用通配符(否则会带来10倍以上的性能下降).
# 如果需要使用 * 以外的通配符, 请查看 https://docs.python.org/3/library/fnmatch.html#module-fnmatch 这里的的说明
enable_automatic_domains_whitelist = True
domains_whitelist_auto_add_glob_list = ('*.google.com', '*.gstatic.com', '*.google.com.hk')

# #####################################################
# ################# ADVANCED Settings #################
# #####################################################

# ############## Output Settings ##############
# Verbose level (0~3) 0:important and error 1:info 2:warning 3:debug. Default is 3 (for first time runner)
verbose_level = 3
# v0.18.5+
# eg: {'access-control-max-age', 'access-control-allow-origin', 'x-connection-hash'}
# must be lower case
custom_allowed_remote_headers = {}

# ############## Cache Settings ##############
# Cache remote static files to your local storage. And access them directly from local storage if necessary.
Expand Down
6 changes: 5 additions & 1 deletion tests/regex_rewriter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,11 @@
new Image().src = "/url?sa=T&url=" + esc_link + "&oi=" + e(oi)+ "&ct=" + e(ct);return false;}
</script></head><body><div class="_lFe"><div class="_kFe"><font style="font-size:larger"></div></div><div class="_jFe">&nb href="https://g.zju.tools:20822/extdomains/https-zh.wikipedia.org/zh-cn/%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91">https://g.zju.tools:20822/extdomains/https-zh.wikipedia.org/zh-cn/%E7%BB%B4%E5%9F%BA%E7%99%BE%E7%A7%91</a><br>&nbsphref="#" onclick="return go_back();" onmousedown="ctu('unauthorizedredirect','originlink');><br></div></body></html> """,
),

(
r"""<a href="https://t.co/hWOMicwES0" rel="nofollow" dir="ltr" data-expanded-url="http://onforb.es/1NqvWJT" class="twitter-timeline-link" target="_blank" title="http://onforb.es/1NqvWJT"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">onforb.es/1NqvWJT</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a>""",
r"""<a href="https://t.co/hWOMicwES0" rel="nofollow" dir="ltr" data-expanded-url="http://onforb.es/1NqvWJT" class="twitter-timeline-link" target="_blank" title="http://onforb.es/1NqvWJT"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">onforb.es/1NqvWJT</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a>""",
r"""<a href="https://t.co/hWOMicwES0" rel="nofollow" dir="ltr" data-expanded-url="http://onforb.es/1NqvWJT" class="twitter-timeline-link" target="_blank" title="http://onforb.es/1NqvWJT"><span class="tco-ellipsis"></span><span class="invisible">http://</span><span class="js-display-url">onforb.es/1NqvWJT</span><span class="invisible"></span><span class="tco-ellipsis"><span class="invisible">&nbsp;</span></span></a>""",
)
)
ColorfulPyPrint_set_verbose_level(5)

Expand Down

0 comments on commit 96ca4d1

Please sign in to comment.