diff --git a/.ci/aptPackagesToInstall.txt b/.ci/aptPackagesToInstall.txt new file mode 100644 index 0000000..e69de29 diff --git a/.ci/pythonPackagesToInstallFromGit.txt b/.ci/pythonPackagesToInstallFromGit.txt new file mode 100644 index 0000000..84ad2e6 --- /dev/null +++ b/.ci/pythonPackagesToInstallFromGit.txt @@ -0,0 +1 @@ +https://github.com/KOLANICH-libs/WordSplitAbs.py diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..843ba14 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,12 @@ +root = true + +[*] +charset = utf-8 +indent_style = tab +indent_size = 4 +insert_final_newline = true +end_of_line = lf + +[*.{yml,yaml,ksy}] +indent_style = space +indent_size = 2 diff --git a/.github/.templateMarker b/.github/.templateMarker new file mode 100644 index 0000000..5e3a3e0 --- /dev/null +++ b/.github/.templateMarker @@ -0,0 +1 @@ +KOLANICH/python_project_boilerplate.py diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000..89ff339 --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 +updates: + - package-ecosystem: "pip" + directory: "/" + schedule: + interval: "daily" + allow: + - dependency-type: "all" diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml new file mode 100644 index 0000000..805a383 --- /dev/null +++ b/.github/workflows/CI.yml @@ -0,0 +1,15 @@ +name: CI +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + build: + runs-on: ubuntu-20.04 + steps: + - name: typical python workflow + uses: KOLANICH-GHActions/typical-python-workflow@master + with: + github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..2d49db2 --- /dev/null +++ b/.gitignore @@ -0,0 +1,15 @@ +__pycache__ +*.pyc +*.pyo +/*.egg-info +/cache +#/mapi_tags.ksy +/mapi_tags.py + +*.srctrlbm +*.srctrldb +build +dist +.eggs +monkeytype.sqlite3 +/.ipynb_checkpoints diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml 
new file mode 100644 index 0000000..6e5ddf8 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,51 @@ +image: registry.gitlab.com/kolanich-subgroups/docker-images/fixed_python:latest + +variables: + DOCKER_DRIVER: overlay2 + SAST_ANALYZER_IMAGE_TAG: latest + SAST_DISABLE_DIND: "true" + SAST_CONFIDENCE_LEVEL: 5 + CODECLIMATE_VERSION: latest + +include: + - template: SAST.gitlab-ci.yml + - template: Code-Quality.gitlab-ci.yml + - template: License-Management.gitlab-ci.yml + +build: + tags: + - shared + - linux + stage: build + variables: + GIT_DEPTH: "1" + PYTHONUSERBASE: ${CI_PROJECT_DIR}/python_user_packages + + before_script: + - export PATH="$PATH:$PYTHONUSERBASE/bin" # don't move into `variables` + - apt-get update + # todo: + #- apt-get -y install + #- pip3 install --upgrade + #- python3 ./fix_python_modules_paths.py + + script: + - python3 -m build -nw bdist_wheel + - mv ./dist/*.whl ./dist/python_project_boilerplate-0.CI-py3-none-any.whl + - pip3 install --upgrade ./dist/*.whl + - coverage run --source=python_project_boilerplate -m --branch pytest --junitxml=./rspec.xml ./tests/test.py + - coverage report -m + - coverage xml + + coverage: /^TOTAL(?:\s+\d+){4}\s+(\d+%).+/ + + cache: + paths: + - $PYTHONUSERBASE + + artifacts: + paths: + - dist + reports: + junit: ./rspec.xml + cobertura: ./coverage.xml diff --git a/Code_Of_Conduct.md b/Code_Of_Conduct.md new file mode 100644 index 0000000..bcaa2bf --- /dev/null +++ b/Code_Of_Conduct.md @@ -0,0 +1 @@ +No codes of conduct! 
def getTextFromNodes(node: Element) -> typing.Iterable[str]:
	"""Recursively yield the text content of *node* in document order."""
	if node.nodeType != node.TEXT_NODE:
		for child in node.childNodes:
			yield from getTextFromNodes(child)
	else:
		yield node.data


def node2text(node: Element) -> str:
	"""Flatten *node* (and all of its descendants) into a single string."""
	return "".join(getTextFromNodes(node))


def iterNextSiblings(n: Element) -> typing.Iterable[Element]:
	"""Yield every sibling that follows *n* within its parent, in order."""
	sib = n.nextSibling
	while sib:
		yield sib
		sib = sib.nextSibling


def textAfter(n: Element) -> str:
	"""Get the text after the tag within the parent element until its end."""
	return "".join(node2text(sib) for sib in iterNextSiblings(n))
def cachedFetchFile(cacheFile, uri):
	"""Return the bytes at *uri*, caching them in *cacheFile*.

	The parent directory is created on demand. If the cache file already
	exists its contents are returned without any network access; otherwise
	the resource is downloaded, stored, and returned.
	"""
	cacheFile.parent.mkdir(parents=True, exist_ok=True)

	if not cacheFile.is_file():
		import httpx  # imported lazily: only needed on a cache miss

		payload = httpx.get(uri).content
		cacheFile.write_bytes(payload)
		return payload

	return cacheFile.read_bytes()
def getTagsWithNonUniqueNames(fileDir: Path = Path(".")):
	"""Return ``(id, origIds)`` pairs for enum entries whose generated name is
	ambiguous (contains ``_or_``), to guide manual name assignment.

	Reads the current ``.ksy`` spec from *fileDir*, falling back to the
	bundled template when the file is absent.
	"""
	# BUG FIX: previously referenced an undefined module-level `parsed` and
	# took no argument, while `__main__.py` calls this with a directory.
	parsed = _kaitai.parseEnumValues(fileDir)
	return [(el.id, el.origIds) for el in parsed if "_or_" in el.id]


def normalizeUniqueNames(t):
	"""Where an entry is marked ``unkn`` but has exactly one distinct
	``-orig-id``, derive its ``id`` from that orig id and collapse
	``-orig-id`` to a scalar."""
	from .nameNormalizer import convertName

	for k in list(t.keys()):
		if not isinstance(t[k], str):
			if t[k]["id"] == "unkn":
				oids = list(dedupPreservingOrder(t[k]["-orig-id"]))
				if len(oids) == 1:
					v = oids[0]
					t[k]["id"] = convertName(v)
					t[k]["-orig-id"] = v


def mergeSourceIntoContext(ctx: SerializingContext, s: "Source.Source", cacheDir: Path) -> None:
	"""Parse *s* (using *cacheDir* for downloads) and merge its enum values
	and provenance (uri + license) into *ctx*.

	BUG FIX: the ``Source.Source`` annotation is a string on purpose — the
	`Source` module is not imported in this file, so the previously eager
	annotation raised ``NameError`` when this module was imported.
	"""
	tagsFromSource = s.parseEnumValues(cacheDir)
	ctx.enumValues2KSEnumDict(tagsFromSource)
	ctx.insertSource(s.uri, s.license)


def fullPipeline(outputDir: Path, sourcesList: "typing.Iterable[Source.Source]", cacheDir: Path) -> None:
	"""Merge every source in *sourcesList* into the spec found in *outputDir*,
	sort the enum by value and write the result back."""
	ctx = _kaitai._getCtxFromDir(outputDir)

	for s in sourcesList:
		mergeSourceIntoContext(ctx, s, cacheDir)

	ctx.sortByKey()
	ctx.dump(outputDir)
@CLI.subcommand("convert")
class ConvertCLI(cli.Application):
	"""Converts the files into Kaitai Struct spec with tag definitions"""

	def main(self, sourceNames: sourcesParamValidator, cacheDir: str = defaultCacheDir):
		# Imported here (not at module top) to keep CLI startup cheap.
		from . import fullPipeline

		selected = [sources[name] for name in sourceNames]
		fullPipeline(Path("."), selected, Path(cacheDir))
+ +doc-ref: + - https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxprops/f6ab1613-aefe-447d-a49c-18217230b148 + - https://docs.microsoft.com/en-us/openspecs/exchange_server_protocols/ms-oxocntc/9b636532-9150-4836-9635-9c9b756c9ccf + - https://github.com/hfig/MAPI/blob/master/src/MAPI/Schema/MapiFieldsMessage.yaml # MIT + - https://github.com/hfig/MAPI/blob/master/src/MAPI/Schema/MapiFieldsOther.yaml # MIT + - https://github.com/nektra/outlook-autocomplete/blob/master/OlAutoComplete/nk2props.h # MIT + - https://github.com/stephenegriffin/mfcmapi/blob/151856e6ef5af42368a49a1340060aa58d981e8e/core/interpret/genTagArray.h # MIT + - https://github.com/dbremner/pstviewtool/blob/52f59893ad4390358053541b0257b4a7f2767024/ptags.h # Likely Apache. The repo contains no license, but the news (https://www.infoq.com/news/2010/05/Outlook-PST-View-Tool-and-SDK/, also https://web.archive.org/web/20140704101722/http://www.microsoft.com/en-us/news/press/2010/may10/05-24psttoolspr.aspx) claim that this tool and https://github.com/enrondata/pstsdk were published under Apache. 
W_POSTFIX = "_W"
A_POSTFIX = "_A"


def clearPostfixes(n: str) -> str:
	"""Strip a single trailing wide/ANSI marker (``_W`` / ``_A``) from a
	property name, if one is present."""
	for suffix in (W_POSTFIX, A_POSTFIX):
		if n.endswith(suffix):
			return n[: -len(suffix)]
	return n
'_svr_eid'), + ('seqid', 'seq_id'), + ('draftid', 'draft_id'), + ('srchid', 'srch_id'), + ('oflid', 'ofl_id'), + ('entryid', 'entry_id'), + ('linkid', 'link_id'), + ('replacetime', 'replace_time'), + ('trackstatus', 'track_status'), + ('clientid', 'client_id'), + ('parentid', 'parent_id'), + ('enabledon', 'enabled_on'), + ('onserver', 'on_server'), + ('schdinfo_', 'schd_info_'), + ('_freebusy_', '_free_busy_'), + ('_mtsout_', '_mts_out_'), + ('_mtsin_', '_mts_in_'), + ('xmlstream', 'xml_stream'), + ('containerid', 'container_id'), + ('templateid', 'template_id'), + ('proposedendtime', 'proposed_end_time'), + ('proposedstarttime', 'proposed_starttime'), + ('starttime', 'start_time'), + ('contactphoto', 'contact_photo'), + ('freebusy', 'free_busy'), + ('sendpost', 'send_post'), + ('readpost', 'read_post'), + ('reportnote', 'report_note'), + ('sendnote', 'send_note'), + ('readnote', 'read_note'), + ('endtxt', 'end_txt'), + ('begintxt', 'begin_txt'), + ('bodytag', 'body_tag'), + ('migrateprofile', 'migrate_profile'), + ('changenum', 'change_num'), + ('versionhistory', 'version_history'), + ('versionskeleton', 'version_skeleton'), + ('serverid', 'server_id'), + ('subitemid', 'subitem_id'), + ('inetmail', 'inet_mail'), + ('dotstuff', 'dot_stuff'), + ('newsfeed', 'news_feed'), + ('peruser', 'per_user'), + ('mailbeat', 'mail_beat'), + ('hotsite', 'hot_site'), + ('endtime', 'end_time'), + ('fixfont', 'fix_font'), + ('ccwrap', 'cc_wrap'), + ('metatag', 'meta_tag'), + ('iconurl', 'icon_url'), + ('itemproc', 'item_proc'), + ('viewinfo', 'view_info'), + ('displayname', 'display_name'), + ('fxsrcstream', 'fx_src_stream'), + ('fxdeststream', 'fx_dest_stream'), + ('othermailbox', 'other_mailbox'), + ('viewprivate', 'view_private'), + ('foldertype', 'folder_type'), + ('viewtype', 'view_type'), + ('ostid', 'ost_id'), + ('shareddata', 'shared_data'), + ('notfound', 'not_found'), + ('mapiuid', 'mapi_uid'), + ('mapiform', 'mapi_form'), + ('phonebook', 'phone_book'), + ('testclsid', 
'test_clsid'), + ('labeleduri', 'labeled_uri'), + ('dispname', 'disp_name'), + ('syncevent', 'sync_event'), + ('slowlink', 'slow_link'), + ('dialup', 'dial_up'), + ('waitfor', 'wait_for'), + ('mimewrap', 'mime_wrap'), + ('tcpip', 'tcp_ip'), + ('traceinfo', 'trace_info'), + ('spamtype', 'spam_type'), + ('userfields', 'user_fields'), + ('viewlist', 'view_list'), + ('clearprops', 'clear_props'), + ('logfile', 'log_file'), + ('deltax', 'delta_x'), + ('deltay', 'delta_y'), + ('xpos', 'x_pos'), + ('ypos', 'y_pos'), + ('mailfrom', 'mail_from'), + ('datainit', 'data_init'), + ('dataterm', 'data_term'), + ('outq_', 'out_q_'), + ('inq_', 'in_q_'), + ('datablock', 'data_block'), + ('viewflags', 'view_flags'), + ('saveas', 'save_as'), + ('folderid', 'folder_id'), + ('portno', 'port_no'), + ('bifinfo', 'bif_info'), + ('msgtracking', 'msg_tracking'), + ('autoresponse', 'auto_response'), + ('favfld', 'fav_fld'), + ('bodypart', 'body_part'), + ('listinfo', 'list_info'), + ('reqcn', 'req_cn'), + ('reqname', 'req_name'), + ('insadmin', 'ins_admin') + ), + OrigIdType.PID_TAG: ( + ('_un_modified', '_unmodified'), + ('msgid', 'msg_id'), + ('itemid', 'item_id'), + ('replid', 'repl_id'), + ('guid', 'guid_'), + ), + OrigIdType.PTAG: ( + ("replid", "repl_id"), + ), + None: ( + ('temporaryflags', 'temporary_flags'), + ('errorinfo', 'error_info'), + ('msgsize', 'msg_size'), + ('attachlist', 'attach_list'), + ('changenum', 'change_num'), + ('addrbook', 'addr_book'), + ('rootdir', 'root_dir'), + ('msgclass', 'msg_class'), + ('messageclass', 'message_class'), + ('mtsid', 'mts_id'), + ('sentmail', 'sent_mail'), + ('to_do_', 'todo_'), + ('subfolder', 'sub_folder'), + ('rowid', 'row_id'), + ('recurrenceid', 'recurrence_id'), + ('readonly', 'read_only'), + ('pathname', 'path_name'), + ('templateid', 'template_id'), + ('datatype', 'data_type'), + ('codepage', 'code_page'), + ('_replid', '_repl_id'), + ('webviewinfo', 'webview_info'), + ('webview', 'web_view'), + ('mailuser', 'mail_user'), + 
('longterm', 'long_term'), + ('newsfeed', 'news_feed') + ) +} + +filters = { + OrigIdType.PR_TAG: ( + ('_oab_', '_offline_address_book_'), + ('ems_ab_', 'address_book_'), + ('_addr_', '_address_'), + ('_auth_', '_authorized_'), + ('_deliv_', '_delivery_'), + ('abeid', 'address_book_eid'), + ('_eid', '_entry_id'), + ('splus', 'schd_plus'), + ('_hab_', '_hier_'), + ('_dl', '_distr_list'), + ('_mhs_', '_message_handling_system_'), + ('_mta', '_message_transfer_agent'), + ('_reckey', '_record_key'), + ('wb_sf_', 'wb_search_folder_'), + ('_cont_', '_content_'), + ('_eid', '_entry_id'), + ('loglev', 'log_level'), + ('vrfy', 'verify'), + ('_hdrs_', '_headers_') + ), + OrigIdType.PID_TAG: ( + ('security_descriptor', 'nt_security_descriptor'), + ('_distribution_list', '_distr_list'), + ('_unauthorized_', '_unauth_'), + ('_away', 'oof'), + ('_t_bl_', '_table_') + ), + None: ( + ('appointment', 'appt'), + ('certificate', 'cert'), + ('recipient_', 'rcpt_'), + ('access_control_list_', 'acl_'), + ('hierarchical', 'hier'), + ('address', 'addr'), + ('message', 'msg'), + ('hasattach', 'has_attachments'), + ('_extended', '_ex'), + ('_eid', '_entry_id'), + ('_telephone_', '_phone_'), + ('received_', 'rcvd_'), + ('number', 'num'), + ('_object_', '_obj_'), + ('_message_', '_msg_'), + ('internet', 'inet'), + ('acct', 'account'), + ('maximum', 'max'), + ('minimum', 'min'), + ('transmitable', 'transmittable'), + ('_binary', '_bin'), + ('_mid_', '_msg_id_'), + ('_cpid', '_code_page_id'), + ('dam_', 'deferred_action_message_'), + ('attribute', 'attr'), + ('schedule_', 'schd_'), + ) +} + + +def processFilterBank(s, bank): + for f in bank: + s = s.replace(*f) + return s + + +wordninjaFalsePositives = ( + "corre_lat_or", + "e_its", + "in_it", + "i_pms", + "rec_ip", + "i_pm", + "x_400", + "x_500", + "x_509", + "x_25", + "rfc_1006", + "a_ddr", + "re_pl", + "rc_vd", + "a_ppt", + "tn_ef", + "ds_a", + "fr_eq", + "a_lg", + "auto_reply", + "time_out", + "a_ck", + "re_cv", + "rcp_t", + 
"canonical_iz_ation", + "map_i", + "tn_s", + "e_smtp", + "e_trn", + "s_mime", + "synchronize_r", + "rt_f", + "acc_t", + "gui_d", + "mid_set", + "x_mt", + "sch_d", + "spool_er", + "nts_d", + "n_td_n", + "s_rc", + "s_can", + "de_st", + "i_mail", + "rm_q", + "x_ref", + "t_bl", + "ow_a", + "at_tr", + "p_1", + "u_id", + "cl_sid", + "out_box", + "m_db", + "as_soc", + "p_2", + "pre_c", + "loop_back", + "re_calc", + "de_queue", + "m_gr", + "au_th", + "start_tls", + "ku_lane", + "dia_g", + "d_is_tr", + "n_ntp", + "if_s", + "an_r", + "c_dorm", + "c_doo_or", + "cd_of_bc", + "s_vr", + "transmit_able", + "tty_tdd", + "pa_b", + "a_cl", + "du_a", + "ad_atp_3", + "con_v", + "p_km", + "version_ing", + "l_cid", + "in_cr", + "re_q", + "rg_m", + "c_pid", + "fl_d", + "ex_ch_50", + "mb_in", + "addr_s", + "o_of", + "sr_ch", + "o_ab", + "of_l", + "open_ning", + "encrypt_er", + "fa_v", + "m_sdos", + "dx_a", + "roll_over", + "back_off", + "de_sig", + "una_u_th", + "x_121", + "xm_it", + "l_dap", + "cf_g", + "adr_s", + "mt_s", + "pui_d", + "mon_the_s", + "x_view", + "log_on", + "cate_g", + "back_fill", + "in_st", + "de_liv", + "appt_s", + "del_s", + "reqc_n", + "telet_ex" +) +wordninjaFalsePositives = [(el, el.replace("_", "")) for el in wordninjaFalsePositives] + + +def fix_after_wordninja(name): + return attachNumber(processFilterBank(name, wordninjaFalsePositives)) + + +def detectOrigIdTypeAndSplitFromRestOfName(name: str) -> OrigIdType: + for k, v in origIdTypeToPrefixMapping.items(): + if k: + if name.startswith(v): + return k, name[len(v) :] + + return OrigIdType.unknown, name + + +def processFilterBundle(tp: OrigIdType, name: str, bundle, middleFixerFunc = None) -> str: + filterBank = bundle.get(tp, ()) + name = processFilterBank(name, filterBank) + + if middleFixerFunc: + name = middleFixerFunc(name) + + name = processFilterBank(name, bundle[None]) + + return name + + +def splitJoinedWords(tp: OrigIdType, n: str, useWordNinja: bool = useWordNinja) -> str: + """Tries to normalize a 
def convertName(n: str, useWordNinja: bool = useWordNinja) -> str:
	"""Tries to normalize a name the way that different kinds of source names result into the same name. Also tries to make the name more easy to read"""

	tp, n = detectOrigIdTypeAndSplitFromRestOfName(n)

	# All schemes except PR_TAG use CamelCase and must be snake_cased first;
	# PR_TAG names are already underscore-separated, so lowercasing suffices.
	if tp != OrigIdType.PR_TAG:
		n = inflection.underscore(n)
	else:
		n = n.lower()

	def middleFixerFunc(n):
		if tp == OrigIdType.PR_TAG:
			n = attachNumber(n)
		elif tp == OrigIdType.PTAG:
			n = n.replace("MTA", "MessageTransferAgent")

		n = n.lower()
		if "attachment" not in n:
			# BUG FIX: the result of `str.replace` was previously discarded
			# (strings are immutable), making this normalization a no-op.
			n = n.replace("has_attach", "has_attachments")

		n = splitJoinedWords(tp, n, useWordNinja=useWordNinja)
		return n

	n = processFilterBundle(tp, n, filters, middleFixerFunc)

	return n
def filterPropsInDoc(doc: str):
	"""Replace ``PidXxx property (section N.N)`` references inside *doc* with
	backtick-quoted normalized tag names."""

	def _repl(m):
		return "`" + convertName(canonicalizeOrigName(m.group("propName"))) + "`"

	newDoc, _count = propsInDocsRX.subn(_repl, doc)
	return newDoc
class SectionDict2Enum:
	"""Extracts ``(origIds, valueSrc, doc)`` from the key/value map built out
	of one PidTag documentation section."""

	__slots__ = ()

	DESCR_NAME = "Description"
	CAN_NAME = "Canonical name"
	PROP_ID_NAME = "Property ID"
	ALT_NAMES_NAME = "Alternate names"

	def __call__(self, smap, origIds):
		cls = self.__class__
		doc = smap.get(cls.DESCR_NAME)
		valueSrc = smap.get(cls.PROP_ID_NAME)

		canonicalName = smap.get(cls.CAN_NAME)
		if canonicalName is not None:
			origIds.append(canonicalName)

		altNames = smap.get(cls.ALT_NAMES_NAME)
		if altNames:
			for alias in altNames.split(","):
				origIds.append(alias.strip())

		return origIds, valueSrc, doc
def getOxPropsDocxLink():
	"""Resolve the download URL of the current MS-OXPROPS ``.docx``.

	Fetches the MS-OXPROPS RSS feed, selects an ``item`` element and scans
	the HTML embedded in its ``description`` for the first anchor whose
	``href`` ends in ``.docx``. Returns that URL, or implicitly ``None``
	when no such link is present.
	"""
	import httpx
	from bs4 import BeautifulSoup
	from dom_query import select

	# NOTE(review): no timeout or status check on the request — presumably
	# acceptable for a manually-run scraper; confirm.
	r = httpx.get(oxprops_feed)
	d = xml.dom.minidom.parseString(r.text)

	i = select(d, "item")

	# The description holds HTML, so it is re-parsed with BeautifulSoup.
	h = node2text(select(i, "description"))
	hd = BeautifulSoup(h, "lxml")
	for el in hd.select("a"):
		lh = el["href"]
		if lh.endswith(".docx"):
			return lh
DocxMarkdownSource("oxprops", "[MS-OXPROPS]", oxprops_feed, "Microsoft proprietary, but reuse in other impls is explicitly allowed") diff --git a/MAPITagsScraper/sources/__init__.py b/MAPITagsScraper/sources/__init__.py new file mode 100644 index 0000000..e240ed2 --- /dev/null +++ b/MAPITagsScraper/sources/__init__.py @@ -0,0 +1,10 @@ +from .genTagArray import genTagArray +from .kaitai import _kaitai +from .mfmy_mfoy import mfmy, mfoy +from .OxProps import oxprops +from .ptags import ptags +from .openchange import OpenChange + +sources = (oxprops, mfmy, mfoy, genTagArray, ptags, OpenChange) + +sources = {s.name: s for s in sources} diff --git a/MAPITagsScraper/sources/genTagArray.py b/MAPITagsScraper/sources/genTagArray.py new file mode 100644 index 0000000..2481b4e --- /dev/null +++ b/MAPITagsScraper/sources/genTagArray.py @@ -0,0 +1,35 @@ +import re +from ast import literal_eval + +from ..consts import GitHubRawBase +from ..KSEnumValue import KSEnumValue +from ..nameNormalizer import canonicalizeOrigName, convertName +from ..Source import Source + +parserGenTagRecordRx = re.compile(r"\s*\{\s*(0x[\da-f]+)\s*,\s*(0x[\da-f]+)\s*,\s*L?\"(\w+)\"\s*\}\s*(?:,\s*)") + + +def parseGenTagArrayLines(headerFileLines): + for l in headerFileLines: + m = parserGenTagRecordRx.match(l) + if m: + yield m.groups() + + +def KSEnumValueFromGenTagArrayTriple(valueStr, typeStr, nameStr): + origName = canonicalizeOrigName(nameStr) + name = convertName(origName) + rawValue = literal_eval(valueStr) + value = rawValue >> (8 * 2) + + return KSEnumValue(name, value, origName, None, None, None) + + +class GenTagArraySource(Source): + __slots__ = () + + def parseValuesFromSrc(self, src): + return [KSEnumValueFromGenTagArrayTriple(*el) for el in parseGenTagArrayLines(src.splitlines())] + + +genTagArray = GenTagArraySource("genTagArray", "genTagArray.h", GitHubRawBase + "/stephenegriffin/mfcmapi/151856e6ef5af42368a49a1340060aa58d981e8e/core/interpret/genTagArray.h", "MIT") diff --git 
import typing
from pathlib import Path

from ..KSEnumValue import KSEnumValue
from ..Source import ProtoSource
from ..utils import sortedDictByKey
from ..utils.yaml import dumpYaml


def KSEnumValueFromKSEnumDictKeyValuePair(key, value):
	"""Build a `KSEnumValue` from one `<int key>: <mapping-or-str>` pair of a ksy enum."""
	if isinstance(value, str):
		# shorthand form: the value is just the enum member id
		oid = []
		doc = None
		iD = value
	else:
		oid = value.get("-orig-id", [])
		if isinstance(oid, str):
			oid = [oid]  # normalize the single-scalar form to a list
		doc = value.get("doc", None)
		iD = value["id"]

	return KSEnumValue(iD, key, oid, doc)


class SerializingContext:
	"""Round-trip editing context for one enum inside a loaded `.ksy` document.

	Loads (or creates from the template) the ruamel.yaml round-trip tree and
	exposes handles to its `meta`, `doc-ref` and the target enum mapping.
	"""

	__slots__ = ("parent", "ks", "enums", "enum", "meta", "docRef")

	def __init__(self, parent, src):
		self.parent = parent

		import ruamel.yaml

		y = ruamel.yaml.YAML(typ="rt")
		ks = y.load(src)
		if ks is None:  # empty file, usually when serialization has failed
			ks = y.load(parent.getTemplateFilePath().read_text("utf-8"))
		self.ks = ks

		meta = ks.get("meta", None)
		if meta is None:
			ks["meta"] = meta = ruamel.yaml.comments.CommentedMap()
		self.meta = meta

		meta["id"] = self.parent.traditionalFileNameStem

		docRef = ks.get("doc-ref", None)
		# BUGFIX: this guard originally tested `meta is None` (copy-paste from
		# the block above), so a missing `doc-ref` was never created and
		# `self.docRef` stayed None, making `insertSource` crash.
		if docRef is None:
			ks["doc-ref"] = docRef = ruamel.yaml.comments.CommentedSeq()
		# NOTE(review): a ksy `doc-ref` may also be a plain string; this code
		# assumes the sequence form — confirm against the template file.
		self.docRef = docRef

		e = ks.get("enums", None)
		if e is None:
			ks["enums"] = e = ruamel.yaml.comments.CommentedMap()
		self.enums = e

		enumName = self.parent.enumName
		ee = e.get(enumName, None)
		if ee is None:
			e[enumName] = ee = ruamel.yaml.comments.CommentedMap()

		self.enum = ee

	def sortByKey(self):
		"""Replace the enum mapping with a copy sorted by numeric key.

		BUGFIX: also rebinds `self.enum`; originally only `enums[...]` was
		replaced, so later `parseValues()` / `enumValues2KSEnumDict()` calls
		kept operating on the stale, unsorted mapping.
		"""
		self.enums[self.parent.enumName] = self.enum = sortedDictByKey(self.enum)

	def parseValues(self):
		"""Return all enum entries as `KSEnumValue`s."""
		return [KSEnumValueFromKSEnumDictKeyValuePair(k, v) for k, v in self.enum.items()]

	@classmethod
	def intoKSEnumDict(cls, ksEnumValue: KSEnumValue, enumInstanceDict, merge=True):
		"""Write `ksEnumValue` into `enumInstanceDict` (one ksy enum entry).

		With `merge=True`, keys absent from `ksEnumValue` are deleted from an
		existing entry instead of being left stale.
		"""
		oid = ksEnumValue.origIds

		enumInstanceDict["id"] = ksEnumValue.id

		if oid:
			if len(oid) == 1:
				oid = oid[0]  # a single orig-id is stored as a plain scalar
			enumInstanceDict["-orig-id"] = oid
		else:
			if merge and "-orig-id" in enumInstanceDict:
				del enumInstanceDict["-orig-id"]

		if ksEnumValue.doc:
			enumInstanceDict["doc"] = ksEnumValue.doc
		else:
			if merge and "doc" in enumInstanceDict:
				del enumInstanceDict["doc"]

		if ksEnumValue.companion:
			enumInstanceDict["-companion"] = ksEnumValue.companion

		if ksEnumValue.subject:
			enumInstanceDict["-subject"] = ksEnumValue.subject

	def decorateInt(self, i):
		"""Wrap `i` so it round-trips as a fixed-width hex literal."""
		import ruamel.yaml

		return ruamel.yaml.scalarint.HexInt(i, width=self.parent.hexWidth)

	def insertSource(self, uri, license):
		"""Append `uri` to `doc-ref` with its license as an end-of-line comment."""
		idx = len(self.docRef)
		self.docRef.append(uri)
		self.docRef.yaml_add_eol_comment(license, idx, column=0)

	def enumValues2KSEnumDict(self, enumValues: typing.Iterable[KSEnumValue]):
		"""Merge `enumValues` into the enum mapping, creating entries as needed."""
		import ruamel.yaml

		for el in enumValues:
			k = self.decorateInt(el.value)
			v = self.enum.get(k, None)
			if v is None:
				v = ruamel.yaml.comments.CommentedMap()
				self.enum[k] = v

			self.__class__.intoKSEnumDict(el, v, merge=True)

	def dump(self, outputDir):
		"""Serialize the whole document into `outputDir`."""
		dumpYaml(self.ks, outputDir / self.parent.traditionalFileName)


class KaitaiSource(ProtoSource):
	"""Source backed by our own `.ksy` file (or its bundled template when absent)."""

	__slots__ = ("enumName", "traditionalFileNameStem", "hexWidth")

	def __init__(self, name, traditionalFileNameStem, enumName, hexWidth):
		self.traditionalFileNameStem = traditionalFileNameStem
		super().__init__(name, traditionalFileNameStem + ".ksy")
		self.enumName = enumName
		self.hexWidth = hexWidth

	def _getCtxFromSrc(self, src):
		return SerializingContext(self, src)

	def _getCtxFromDir(self, fileDir: Path):
		return self._getCtxFromSrc(self.fetch(fileDir))

	def parseValuesFromSrc(self, src):
		return self._getCtxFromSrc(src).parseValues()

	def fetch(self, fileDir: Path) -> str:
		"""Read the ksy file from `fileDir`, falling back to the template."""
		ksyFile = fileDir / self.traditionalFileName
		ksyFileToLoad = ksyFile if ksyFile.exists() else self.getTemplateFilePath()
		return ksyFileToLoad.read_text(encoding="utf-8")

	def getTemplatefileName(self):
		return self.traditionalFileNameStem + ".template.ksy"

	def getTemplateFilePath(self):
		# the template ships next to the package root, one level above `sources/`
		return Path(__file__).parent.parent / self.getTemplatefileName()


_kaitai = KaitaiSource("_kaitai", "mapi_tags", "tag", 4)
from ..consts import GitHubRawBase
from ..KSEnumValue import KSEnumValue
from ..nameNormalizer import prepareNamesAndOrigIds
from ..Source import Source

MFMRepo = "hfig/MAPI"
MFMRepoSchemaPath = "src/MAPI/Schema/"
MFMRepoBranch = "master"

MFMRepoSchemaBase = GitHubRawBase + MFMRepo + "/" + MFMRepoBranch + "/" + MFMRepoSchemaPath


class MFMSource(Source):
	"""Common machinery for the hfig/MAPI YAML schema files."""

	__slots__ = ()

	def processItem(self, k, v_origIds):
		"""Build one KSEnumValue from a numeric key and its original id names."""
		v_origIds, names = prepareNamesAndOrigIds(v_origIds, True)
		return KSEnumValue("_or_".join(names), k, v_origIds, None, None, None)

	def processItems(self, enumValuesDictItems):
		for k, v in enumValuesDictItems:
			yield self.processItem(k, v)

	def parseValuesFromYaml(self, y):
		raise NotImplementedError

	def parseValuesFromSrc(self, src):
		import ruamel.yaml

		y = ruamel.yaml.YAML(typ="safe")
		y = y.load(src)
		return self.parseValuesFromYaml(y)


class MFMYSource(MFMSource):
	"""MapiFieldsMessage.yaml: flat `hex-string -> [names...]` mapping."""

	__slots__ = ()

	def parseValuesFromYaml(self, y):
		# only the first listed name is kept (v[0:1]); keys are hex strings
		itemsToProcess = ((int(k, 16), v[0:1]) for k, v in y.items())
		return self.processItems(itemsToProcess)


# NOTE(review): removed a stray `from icecream import ic` debug import that was
# left between the classes; `ic` was never used in this file.
class MFOYSource(MFMSource):
	"""MapiFieldsOther.yaml: sections of `name -> value` mappings."""

	__slots__ = ()

	def parseValuesFromYaml(self, y):
		# PS_PUBLIC_STRINGS is skipped — presumably not numeric tag values;
		# TODO confirm against the upstream schema file.
		del y["PS_PUBLIC_STRINGS"]
		for _sectionName, enumValuesDict in y.items():
			itemsToProcess = ((k, (v,)) for k, v in enumValuesDict.items())
			yield from self.processItems(itemsToProcess)


mfoy = MFOYSource("mfoy", "MapiFieldsOther.yaml", MFMRepoSchemaBase + "MapiFieldsOther.yaml", "MIT")
mfmy = MFMYSource("mfmy", "MapiFieldsMessage.yaml", MFMRepoSchemaBase + "MapiFieldsMessage.yaml", "MIT")
def if1LevelAttr(firstLevel: str, secondLevel: str):
	"""Make a predicate matching AST call targets of the exact form `firstLevel.secondLevel`."""

	def res(funcExpr) -> bool:
		return isinstance(funcExpr, ast.Attribute) and isinstance(funcExpr.value, ast.Name) and funcExpr.value.id == firstLevel and funcExpr.attr == secondLevel

	return res


ifPropertiesAppend = if1LevelAttr("properties", "append")
ifAltnamelinesAppend = if1LevelAttr("altnamelines", "append")
ifFWrite = if1LevelAttr("f", "write")
ifStrLjust = if1LevelAttr("string", "ljust")


class OpenChangeSectionDict2Enum(SectionDict2Enum):
	"""SectionDict2Enum specialised to OpenChange's record-dict key names."""

	__slots__ = ()

	CAN_NAME = "CanonicalName"
	PROP_ID_NAME = "PropertyId"
	ALT_NAMES_NAME = "AlternateNames"


sectDict2Enum = OpenChangeSectionDict2Enum()


def parsePropertiesAppend(xprs):
	"""Yield KSEnumValues from `properties.append({...})` calls in `xprs`."""
	for el in xprs:
		v = el.value
		if ifPropertiesAppend(v.func):
			record = correctedLiteralEval(v.args[0])
			origIds, value, doc = sectDict2Enum(record, [])
			origId, name = next(zip(*prepareNamesAndOrigIds(origIds)))
			yield KSEnumValue(name, value, origId, doc, None, None)


def parseAltnamelinesAppend(xprs):
	"""Yield KSEnumValues from `altnamelines.append("#define ...")` calls in `xprs`."""
	for el in xprs:
		v = el.value
		if ifAltnamelinesAppend(v.func):
			res = parseValueFromSourceEnumStr(correctedLiteralEval(v.args[0]).replace("\n", ""))
			if res:
				yield res


# NOTE(review): removed a stray `from icecream import ic` debug import that sat
# between these helpers; `ic` was never used in this file.


def parseFWrites(xprs):
	"""Yield KSEnumValues recovered from `f.write(... string.ljust(name) ... + value)` calls.

	Expects the written expression to be `(<prefix> + string.ljust(<name>)) + <value>`
	where `<value>` looks like `= <literal>,`.
	"""
	for el in xprs:
		v = el.value
		if ifFWrite(v.func):
			a = v.args[0]
			if isinstance(a, ast.BinOp):
				l = a.left
				if isinstance(l, ast.BinOp):
					lr = l.right
					if isinstance(lr, ast.Call) and ifStrLjust(lr.func):
						r = a.right
						name = correctedLiteralEval(lr.args[0])
						val = correctedLiteralEval(r).strip()
						if val[0] == "=" and val[-1] == ",":
							val = val[1:-1].strip()  # peel `= ... ,` down to the literal
						val = correctedLiteralEval(val)
						origId, name = next(zip(*prepareNamesAndOrigIds([name])))
						yield KSEnumValue(name, val, origId, None, None, None)


def parseMMPF(sourceAST):
	"""Parse every extractable tag from the body of `make_mapi_properties_file`."""
	mmpfB = [el for el in sourceAST.body if isinstance(el, ast.FunctionDef) and el.name == "make_mapi_properties_file"][0].body
	xprs = [el for el in mmpfB if isinstance(el, ast.Expr) and isinstance(el.value, ast.Call)]
	yield from parsePropertiesAppend(xprs)
	yield from parseAltnamelinesAppend(xprs)
	yield from parseFWrites(xprs)
from ..consts import GitHubRawBase
from ..KSEnumValue import KSEnumValue
from ..nameNormalizer import prepareNamesAndOrigIds
from ..Source import Source

import ast
import re
import typing

import simpleeval

# BUGFIX: the named groups had lost their `<name>` / `<payload>` parts —
# `(?P\w+)` is not valid regex syntax and raises `re.error` at import time,
# while the code below addresses `m.group("name")` and `m.group("payload")`.
defineRx = re.compile(r"^\s*#define\s+(?P<name>\w+)\s+(?P<payload>.+)\s*$")
removeEndNoRx = re.compile(r"^(\w+?)(?:_\d+)?$")

evaluator = simpleeval.SimpleEval()
# restrict the evaluator to pure arithmetic / bit-twiddling node types
# (duplicate ast.BinOp entry deduplicated)
allowedNodeTypes = {ast.Constant, ast.Num, ast.UnaryOp, ast.BinOp, ast.Compare, ast.Expr, ast.Tuple, ast.BitOr}
evaluator.nodes = {nt: cb for nt, cb in evaluator.nodes.items() if nt in allowedNodeTypes}


def parseValueFromSourceEnumStr(l: str) -> typing.Optional[KSEnumValue]:
	"""Parse one C `#define NAME PAYLOAD` line into a KSEnumValue, or None.

	PAYLOAD is either a plain integer expression or a `PROP_TAG(<type>, <id>)`
	invocation; only the id part contributes to the value. Numeric `_N`
	suffixes on NAME are stripped. Lines that don't parse produce a diagnostic
	print (best-effort scraping) and return None.
	"""
	m = defineRx.match(l)
	if not m:
		return None

	origId = removeEndNoRx.match(m.group("name")).group(1)  # strip `_N` suffix
	payload = m.group("payload")
	payload = payload.rsplit(")", 1)
	typ = None
	valueRaw = None

	if len(payload) > 1:
		payload = "".join(payload[:-1])
		if payload:
			v = payload.replace("PROP_TAG(", "").replace("(ULONG)", "").split(",")
			if len(v) == 2:
				typ, valueRaw = v
			else:
				print(v)  # diagnostic: unexpected PROP_TAG arity
		else:
			print(l)  # diagnostic: nothing before the closing paren
	else:
		valueRaw = payload[0]  # plain (non-PROP_TAG) payload

	if valueRaw:
		try:
			value = evaluator.eval(valueRaw)
		except simpleeval.FeatureNotAvailable:
			pass  # payload used a construct we deliberately disallow
		else:
			# typ = evaluator.eval(typ) # not needed
			origId, name = next(zip(*prepareNamesAndOrigIds([origId])))
			return KSEnumValue(name, value, origId, None, None, None)
import typing


def dedupPreservingOrder(args: typing.Iterable[str]) -> typing.Iterator[str]:
	"""Yield the items of *args* in their original order, skipping repeats."""
	seen = set()
	for item in args:
		if item in seen:
			continue
		seen.add(item)
		yield item


def sortedDictByKey(dic):
	"""Return a new mapping of the same class as *dic* with items ordered by key."""
	return dic.__class__(sorted(dic.items()))
0000000..717f44e --- /dev/null +++ b/ReadMe.md @@ -0,0 +1,11 @@ +MAPITagsScraper.py [![Unlicensed work](https://raw.githubusercontent.com/unlicense/unlicense.org/master/static/favicon.png)](https://unlicense.org/) +=============== +~~[wheel (GitLab)](https://gitlab.com/KOLANICH-tools/MAPITagsScraper.py/-/jobs/artifacts/master/raw/dist/MAPITagsScraper-0.CI-py3-none-any.whl?job=build)~~ +~~[wheel (GHA via `nightly.link`)](https://nightly.link/KOLANICH-tools/MAPITagsScraper.py/workflows/CI/master/MAPITagsScraper-0.CI-py3-none-any.whl)~~ +~~![GitLab Build Status](https://gitlab.com/KOLANICH-tools/MAPITagsScraper.py/badges/master/pipeline.svg)~~ +~~![GitLab Coverage](https://gitlab.com/KOLANICH-tools/MAPITagsScraper.py/badges/master/coverage.svg)~~ +~~[![GitHub Actions](https://github.com/KOLANICH-tools/MAPITagsScraper.py/workflows/CI/badge.svg)](https://github.com/KOLANICH-tools/MAPITagsScraper.py/actions/)~~ +[![Libraries.io Status](https://img.shields.io/librariesio/github/KOLANICH-tools/MAPITagsScraper.py.svg)](https://libraries.io/github/KOLANICH-tools/MAPITagsScraper.py) +[![Code style: antiflash](https://img.shields.io/badge/code%20style-antiflash-FFF.svg)](https://github.com/KOLANICH-tools/antiflash.py) + +Extracts MAPI tags from MS docs and other sources. diff --git a/UNLICENSE b/UNLICENSE new file mode 100644 index 0000000..efb9808 --- /dev/null +++ b/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. 
For more information, please refer to <https://unlicense.org/>
git+https://github.com/danthedeckie/simpleeval.git"] + +[project.urls] +Homepage = "https://codeberg.org/KOLANICH-tools/MAPITagsScraper.py" + +[project.scripts] +MAPITagsScraper = "MAPITagsScraper.__main__:CLI.run" + +[tool.setuptools] +zip-safe = true +include-package-data = false + +[tool.setuptools.packages] +find = {namespaces = false} + +[tool.setuptools_scm]