Skip to content

Commit

Permalink
extraction: add heuristics and explicit 3.13 support (#173)
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Nov 27, 2024
1 parent 3ea88b3 commit 69fea3b
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 2 deletions.
5 changes: 3 additions & 2 deletions htmldate/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@
contains(@class, 'fa-clock-o') or
contains(@class, 'fa-calendar') or
contains(@class, 'fecha') or
- contains(@class, 'parution')
+ contains(@class, 'parution') or
+ contains(@id, 'footer-info-lastmod')
] |
.//footer | .//small
"""
Expand Down Expand Up @@ -173,7 +174,7 @@

# use of regex module for speed?
TEXT_PATTERNS = re.compile(
- r'(?:date[^0-9"]{,20}|updated|published|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
+ r'(?:date[^0-9"]{,20}|updated|last-modified|published|posted|on)(?:[ :])*?([0-9]{1,4})[./]([0-9]{1,2})[./]([0-9]{2,4})|' # EN
r"(?:Datum|Stand|Veröffentlicht am):? ?([0-9]{1,2})\.([0-9]{1,2})\.([0-9]{2,4})|" # DE
r"(?:güncellen?me|yayı(?:m|n)lan?ma) *?(?:tarihi)? *?:? *?([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4})|"
r"([0-9]{1,2})[./]([0-9]{1,2})[./]([0-9]{2,4}) *?(?:'de|'da|'te|'ta|’de|’da|’te|’ta|tarihinde) *(?:güncellendi|yayı(?:m|n)landı)", # TR
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Scientific/Engineering :: Information Analysis",
"Topic :: Text Processing :: Linguistic",
Expand Down

0 comments on commit 69fea3b

Please sign in to comment.