Skip to content

Commit

Permalink
3.1.1
Browse files Browse the repository at this point in the history
  • Loading branch information
cdhigh committed Jun 16, 2024
1 parent 638bafe commit 8c22778
Show file tree
Hide file tree
Showing 16 changed files with 516 additions and 595 deletions.
115 changes: 25 additions & 90 deletions application/lib/dictionary/mdict/mdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,20 +41,19 @@ def __init__(self, database='', host=None):
self.database = database
self.dictionary = None
if database in self.databases:
#try:
try:
self.dictionary = IndexedMdx(database)
#except Exception as e:
#default_log.warning(f'Instantiate mdict failed: {self.databases[database]}: {e}')
except Exception as e:
default_log.warning(f'Instantiate mdict failed: {self.databases[database]}: {e}')
else:
default_log.warning(f'dict not found: {self.databases[database]}')

#Return the name of the dictionary currently in use
def __repr__(self):
    """Return 'mdict [<name>]', where <name> is looked up in self.databases.

    The name is self.databases[self.database]; an empty string is used when
    the current database key is not present in that mapping.
    """
    current_name = self.databases.get(self.database, '')
    return 'mdict [{}]'.format(current_name)

def definition(self, word, language=''):
    """Look up ``word`` in the loaded dictionary.

    Args:
        word: The headword to look up.
        language: Not used by this method; kept so the signature is unchanged.

    Returns:
        The value returned by ``self.dictionary.get(word)``, decoded to ``str``
        when it is ``bytes``. An empty string when no dictionary is loaded.
    """
    if not self.dictionary:
        return ''

    ret = self.dictionary.get(word)
    if isinstance(ret, bytes):
        #Decode with the encoding recorded in the dictionary metadata (utf-8
        #when none is recorded or the dictionary object carries no metadata).
        meta = getattr(self.dictionary, 'meta', None) or {}
        ret = ret.decode(meta.get('encoding', 'utf-8'))
    return ret

#经过词典树缓存的Mdx
class IndexedMdx:
Expand All @@ -66,62 +65,35 @@ def __init__(self, fname, encoding="", substyle=False, passcode=None):
prefix = os.path.splitext(fname)[0]
dictName = os.path.basename(prefix)
trieName = f'{prefix}.trie'
metaName = f'{prefix}.meta'
self.trie = None
self.meta = {}
self.stylesheet = {}
if os.path.exists(trieName) and os.path.exists(metaName):
self.mdx = MDX(fname, encoding, substyle, passcode)
if os.path.exists(trieName):
try:
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT)
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT) #type:ignore
self.trie.load(trieName)
with open(metaName, 'r', encoding='utf-8') as f:
self.meta = json.loads(f.read())
if not isinstance(self.meta, dict):
self.meta = {}
self.stylesheet = json.loads(self.meta.get("stylesheet", '{}'))
except Exception as e:
self.trie = None
default_log.warning(f'Failed to load mdict trie data: {dictName}: {e}')

if self.trie and self.meta:
self.fMdx = open(fname, 'rb')
if self.trie:
return

#重建索引
default_log.info(f"Building trie for {dictName}")
mdx = MDX(fname, encoding, substyle, passcode)
dictIndex = mdx.get_index()
indexList = dictIndex["index_dict_list"]
#[(word, (params,)),...]
#为了能制作大词典,mdx中这些数据都是64bit的,但是为了节省空间,这里只使用32bit保存(>LLLLLL)
idxBuff = [(item["key_text"].lower(), (
item["file_pos"], #32bit
item["compressed_size"], #64bit
item["decompressed_size"], #64bit
item["record_start"], #64bit
item["record_end"], #64bit
item["offset"])) #64bit
for item in indexList]
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, idxBuff)
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT, self.mdx.get_index()) #type:ignore
self.trie.save(trieName)
self.meta = dictIndex['meta']
#mdx内嵌css,键为序号(1-255),值为元祖 (startTag, endTag)
self.stylesheet = json.loads(self.meta.get("stylesheet", '{}'))
with open(metaName, 'w', encoding='utf-8') as f:
f.write(json.dumps(self.meta))

self.fMdx = open(fname, 'rb')

del mdx

del self.trie
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT)
self.trie = marisa_trie.RecordTrie(self.TRIE_FMT) #type:ignore
self.trie.load(trieName)
del idxBuff
import gc
gc.collect()

#获取单词释义,不存在则返回空串
def get(self, word):
if not self.trie:
return ''
word = word.lower().strip()
indexes = self.trie[word] if word in self.trie else None
ret = self.get_content_by_Index(indexes)
Expand All @@ -138,47 +110,21 @@ def __contains__(self, word) -> bool:
#Use a word's index data to read the matching data blocks straight from the
#mdx file and return the definition.
#indexes is a list, because one word may have several entries.
def get_content_by_Index(self, indexes):
    """Read, decompress and join the records addressed by ``indexes``.

    Args:
        indexes: Iterable of 6-tuples
            (filePos, compSize, decompSize, startPos, endPos, offset),
            or a falsy value when the word was not found.

    Returns:
        The result of ``self.post_process`` applied to the records joined with
        '<hr/>'; an empty string when ``indexes`` is falsy.
    """
    #NOTE(review): the commit shown on this page appears to replace this body
    #with `return self.post_process(self.mdx.get_content_by_Index(indexes))`.
    #That line sat after the final return here and was unreachable, so it was
    #dropped - confirm which variant the surrounding class expects.
    if not indexes:
        return ''

    encoding = self.meta.get('encoding', 'utf-8')
    ret = []
    for filePos, compSize, decompSize, startPos, endPos, offset in indexes:
        self.fMdx.seek(filePos)
        compressed = self.fMdx.read(compSize)
        #Block layout: 32bit type, 32bit adler, then the data
        type_ = compressed[:4]
        payload = compressed[8:]
        if type_ == b"\x00\x00\x00\x00":
            data = payload
        elif type_ == b"\x01\x00\x00\x00":
            data = lzo.decompress(payload, initSize=decompSize, blockSize=1308672)
        elif type_ == b"\x02\x00\x00\x00":
            data = zlib.decompress(payload)
        else:
            #Unknown block type: skip this entry
            continue
        #startPos/endPos are shifted by offset to index into this block
        record = data[startPos - offset : endPos - offset]
        ret.append(record.decode(encoding, errors="ignore").strip("\x00"))

    txt = '<hr/>'.join(ret)
    if self.stylesheet:
        txt = self.replace_css(txt)

    #Many hand-made mdx files are complex and may need post-processing
    return self.post_process(txt)

#对查词结果进行后处理
def post_process(self, content):
if not content:
return ''

soup = BeautifulSoup(content, 'html.parser') #html.parser不会自动添加body
soup = BeautifulSoup(content, 'html.parser') #html.parser不会自动添加html/body

#删除图像
for tag in soup.find_all('img'):
tag.extract()

self.inline_css(soup)
self.adjust_css(soup)
#self.inline_css(soup) #碰到稍微复杂一些的CSS文件性能就比较低下,暂时屏蔽对CSS文件的支持
self.remove_empty_tags(soup)

body = soup.body
Expand All @@ -187,9 +133,9 @@ def post_process(self, content):

return str(soup)

#将css样式内联到html标签中
def inline_css(self, soup):
# 首先删除 height 属性
#调整一些CSS
def adjust_css(self, soup):
#删除 height 属性
for element in soup.find_all():
if element.has_attr('height'):
del element['height']
Expand All @@ -200,6 +146,8 @@ def inline_css(self, soup):
del newStyle['height']
element['style'] = "; ".join(f"{k}: {v}" for k, v in newStyle.items())

#将外部单独css文件的样式内联到html标签中
def inline_css(self, soup):
link = soup.find('link', attrs={'rel': 'stylesheet', 'href': True})
if not link:
return
Expand All @@ -225,7 +173,7 @@ def inline_css(self, soup):
except Exception as e:
default_log.warning(f'parse css failed: {self.mdxFilename}: {e}')
return

for rule in cssRules:
if rule.type == rule.STYLE_RULE:
selector = rule.selectorText
Expand Down Expand Up @@ -263,16 +211,3 @@ def remove_empty_tags(self, soup, preserve_tags=None):
self.remove_empty_tags(tag, preserve_tags)
for tag in empty_tags:
tag.decompose()

#"Replace css": this is not really css but a kind of template substitution;
#it is just what everybody calls it.
def replace_css(self, txt):
    """Expand the `N` markers in ``txt`` using ``self.stylesheet``.

    Each marker applies to the text that follows it, up to the next marker.
    That text is wrapped in ``self.stylesheet["N"][0]`` and
    ``self.stylesheet["N"][1]``. When the text ends with a newline, its
    trailing whitespace is stripped and "\\r\\n" is appended after the
    closing part. Text before the first marker is kept as is.
    """
    segments = re.split(r"`\d+`", txt)
    markers = re.findall(r"`\d+`", txt)

    pieces = [segments[0]]
    for marker, segment in zip(markers, segments[1:]):
        #Drop the surrounding backticks to get the stylesheet key
        style = self.stylesheet[marker[1:-1]]
        if segment.endswith("\n"):
            pieces.append(style[0] + segment.rstrip() + style[1] + "\r\n")
        else:
            pieces.append(style[0] + segment + style[1])
    return "".join(pieces)
Loading

0 comments on commit 8c22778

Please sign in to comment.