Skip to content

Commit

Permalink
use regex to parse publish date
Browse files Browse the repository at this point in the history
  • Loading branch information
mawenbao committed Mar 17, 2014
1 parent 7d7526d commit be495c3
Show file tree
Hide file tree
Showing 10 changed files with 199 additions and 98 deletions.
4 changes: 4 additions & 0 deletions HISTORY.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Gofeed发布历史

## v0.1.5 2014.03.17

* 使用正则表达式处理发布时间。

## v0.1.4 2014.02.17

* 修复cache里存在超过CacheLifeTime条目的bug。
Expand Down
15 changes: 12 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,17 +53,17 @@ See `example_config.json` and `example_config2.json`.
* Feed.IndexPattern: (array of strings) array of index patterns, used to extract entry link and entry title from the filtered content by Feed.IndexFilterPattern.
* Feed.ContentFilterPattern: (array of strings) array of content patterns, used to extract valid content html from the entire html identified by {link}.
* Feed.ContentPattern: (array of strings) array of content patterns, used to extract entry description from the entry's filtered html content by Feed.ContentFilterPattern.
* Feed.PubDateFormat: (string) string format of publish date, see these [predefined format constants](http://golang.org/pkg/time/#pkg-constants) for example.
* Feed.PubDatePattern: (array of strings) array of publish date patterns, written with the pre-defined date time patterns listed below. Each one is used to extract the publish date of an article from the string captured by the {pubdate} pattern.

And you should note that

1. There should be as many Feed.URL as Feed.IndexPattern. If the array lengths of the two do not match, there should be only one Feed.IndexPattern or only one Feed.URL, which means all the Feed.URL will share the same Feed.IndexPattern or all the Feed.IndexPattern will share the same Feed.URL. Otherwise, a configuration parse error will be returned.

2. For Feed.ContentPattern, there should be as many Feed.URL as Feed.ContentPattern. If the array lengths of the two do not match, there should be only one Feed.ContentPattern, which means all the Feed.URL will share the same Feed.ContentPattern. The same goes for Feed.PubDatePattern.

3. Both Feed.IndexPattern and Feed.ContentPattern can contain {pubdate} pattern, and if {pubdate} exists in both, gofeed will use the Feed.ContentPattern's.
3. Either Feed.IndexPattern or Feed.ContentPattern can contain the {pubdate} pattern, but not both.

### Predefined patterns
### Pre-defined patterns
You can use the following predefined patterns in `Feed.IndexPattern` and `Feed.ContentPattern` of the json configuration. Note that all these patterns are **lazy** and perform **leftmost** match, which means they will match as few characters as possible.

* {any}: match any character including newline
Expand All @@ -73,6 +73,15 @@ You can use the following predefined patterns in `Feed.IndexPattern` and `Feed.C
* {pubdate}: publish date of feed entry
* {filter}: filtered content, used in Feed.IndexFilterPattern or Feed.ContentFilterPattern

Date time patterns, currently used to parse the publish date string extracted by the {pubdate} pattern:

* {year}: must be an integer
* {month}: must be an integer or a month name such as Jan or January
* {day}: must be an integer
* {hour}: must be an integer
* {minute}: must be an integer
* {second}: must be an integer

### Custom regular expressions
You can also write custom regex in `Feed.IndexPattern` and `Feed.ContentPattern`. Make sure there are no predefined patterns in your custom regular expressions. The regex syntax documentation can be found [here](https://code.google.com/p/re2/wiki/Syntax).

Expand Down
15 changes: 7 additions & 8 deletions config.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,24 +89,23 @@ func ParseJsonConfig(path string) (feedTargets []*FeedTarget) {
feedTar.Title = filepath.Base(tar.FeedPath)
}

// check patterns
// check index/content patterns
if !CheckPatterns(tar) {
log.Fatal("error parsing configuration: empty index/content pattern")
}

// check pubdate patterns
pubDateNum := len(tar.PubDatePatterns)
if 0 != pubDateNum && 1 != pubDateNum && len(tar.URLs) != pubDateNum {
log.Fatalf("failed to parse pubdate patterns, number of pubdate should be 0 or 1 or the same as Feed.URL")
}

// compile patterns
err = CompilePatterns(feedTar, tar)
if nil != err {
log.Fatalf("failed to compile index/content patterns for feed target %s: %s", feedTar.FeedPath, err)
}

// check pubdate format
pubDateNum := len(tar.PubDateFormats)
if 0 != pubDateNum && 1 != pubDateNum && len(tar.URLs) != pubDateNum {
log.Fatalf("failed to parse pubdate formats, number of pubdate should be 0 or 1 or the same as Feed.URL")
}
feedTar.PubDateFormats = tar.PubDateFormats

// normalize url
if 0 == len(tar.URLs) {
log.Fatalf("no urls for %s", tar.FeedPath)
Expand Down
39 changes: 16 additions & 23 deletions define.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import (

const (
GOFEED_NAME = "gofeed"
GOFEED_VERSION = "0.1.4"
GOFEED_VERSION = "0.1.5"
GOFEED_PROJECT = "https://github.com/mawenbao/gofeed"

// used to normalize urls
Expand All @@ -28,29 +28,22 @@ const (
CACHE_LIFETIME_ALL_REG = `^([1-9][0-9]*[smhd])+$`
CACHE_LIFETIME_REG = `([1-9][0-9]*)([smhd])`

TITLE_NAME = "title"
PATTERN_TITLE = "{" + TITLE_NAME + "}"
PATTERN_TITLE_REG = `(?P<` + TITLE_NAME + `>(?s).+?)`

LINK_NAME = "link"
PATTERN_LINK = "{" + LINK_NAME + "}"
PATTERN_LINK_REG = `(?P<` + LINK_NAME + `>(?s).+?)`

CONTENT_NAME = "description"
PATTERN_CONTENT = "{" + CONTENT_NAME + "}"
PATTERN_CONTENT_REG = `(?P<` + CONTENT_NAME + `>(?s).*?)`

PUBDATE_NAME = "pubdate"
PATTERN_PUBDATE = "{" + PUBDATE_NAME + "}"
PATTERN_PUBDATE_REG = `(?P<` + PUBDATE_NAME + `>(?s).*?)`

FILTER_NAME = "filter"
PATTERN_FILTER = "{" + FILTER_NAME + "}"
PATTERN_FILTER_REG = `(?P<` + FILTER_NAME + `>(?s).+?)`

PATTERN_ANY = "{any}"
PATTERN_ANY_REG = "(?s).*?"

PATTERN_TITLE = "title"
PATTERN_LINK = "link"
PATTERN_CONTENT = "description"
PATTERN_PUBDATE = "pubdate"
PATTERN_FILTER = "filter"

PATTERN_YEAR = "year"
PATTERN_MONTH = "month"
PATTERN_DAY = "day"
PATTERN_HOUR = "hour"
PATTERN_MINUTE = "minute"
PATTERN_SECOND = "second"

// db related consts
DB_DRIVER = "sqlite3"
DB_NAME = "cache.db"
Expand Down Expand Up @@ -92,7 +85,7 @@ type TargetConfig struct {
ContentPatterns []string `json:"Feed.ContentPattern"`
IndexFilterPatterns []string `json:"Feed.IndexFilterPattern"`
ContentFilterPatterns []string `json:"Feed.ContentFilterPattern"`
PubDateFormats []string `json:"Feed.PubDateFormat"`
PubDatePatterns []string `json:"Feed.PubDatePattern"`
FeedPath string `json:"Feed.Path"`
ReqInterval time.Duration `json:"Request.Interval"`
}
Expand All @@ -105,11 +98,11 @@ type FeedTarget struct {
ContentRegs []*regexp.Regexp
IndexFilterRegs []*regexp.Regexp
ContentFilterRegs []*regexp.Regexp
PubDateRegs []*regexp.Regexp
FeedPath string
ReqInterval time.Duration
CacheDB string
CacheLifetime time.Duration
PubDateFormats []string
}

type Feed struct {
Expand Down
8 changes: 4 additions & 4 deletions example_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"Feed.URL": ["http://www.huxiu.com/rss/0.xml"],
"Feed.IndexPattern": ["<item><title>{title}</title>{any}<link>{link}</link>{any}<pubDate>{pubdate}</pubDate>"],
"Feed.ContentPattern": ["<div{any}id=\"neirong_box\"{any}<table>{any}<div>{description}<!--"],
"Feed.PubDateFormat": ["Mon, 02 Jan 2006 15:04:05 -0700"],
"Feed.PubDatePattern": ["{any}, {day} {month} {year} {hour}:{minute}:{second}"],
"Feed.Path": "huxiu.xml",
"Request.Interval": 5
},
Expand All @@ -16,7 +16,7 @@
"Feed.URL": ["http://www.timetimetime.net"],
"Feed.IndexPattern": ["<div class=\"left_contant_title\">{any}(经典语录|阅读生活|另一面|读好书){any}<a href=\"{link}\"{any}>{title}</a>"],
"Feed.ContentPattern": ["</h1><div{any}<span{any}>{pubdate}<span{any}<div class=\"neiz1 a7\">{any}</div>{description}</div>"],
"Feed.PubDateFormat": ["2006-01-02 15:04"],
"Feed.PubDatePattern": ["{year}-{month}-{day} {hour}:{minute}"],
"Feed.Path": "timetimetime.xml",
"Request.Interval": 5
},
Expand All @@ -25,7 +25,7 @@
"Feed.URL": ["http://www.hexieshe.com/feed"],
"Feed.IndexPattern": ["<item><title>{title}</title><link>{link}</link>{any}<pubDate>{pubdate}</pubDate>"],
"Feed.ContentPattern": ["<article{any}<div class=\"entry\">{description}<div"],
"Feed.PubDateFormat": ["Mon, 02 Jan 2006 15:04:05 -0700"],
"Feed.PubDatePattern": ["{any}, {day} {month} {year} {hour}:{minute}:{second}"],
"Feed.Path": "hexieshe.xml",
"Request.Interval": 5
},
Expand Down Expand Up @@ -53,7 +53,7 @@
"<div id=\"t_0\"{any}>{filter}</div>"
],
"Feed.ContentPattern": ["<em class=\"pubTime\">(最后更新:)?{pubdate}</em>{any}id=\"articleContent\">{description}(<!--|</section>)"],
"Feed.PubDateFormat": ["2006-01-02 15:04:05"],
"Feed.PubDatePattern": ["{year}-{month}-{day} {hour}:{minute}:{second}"],
"Feed.Path": "nanfangzhoumo.xml",
"Request.Interval": 5
}
Expand Down
3 changes: 2 additions & 1 deletion example_config2.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
"Feed.Title": "mwb's blog",
"Feed.URL": ["http://blog.atime.me", "blog.atime.me/index2.html"],
"Feed.IndexPattern": ["<div class=\"niu2-index-article-title\"><span><a href=\"{link}\">{title}</a></span></div>", "<div class=\"niu2-index-article-title\"><span><a href=\"{link}\">{title}</a></span></div>"],
"Feed.ContentPattern": ["<div class=\"clearfix visible-xs niu2-clearfix\"></div>{description}<div id=\"content-comments\">"],
"Feed.ContentPattern": ["<div id=\"niu2-main-content\">{description}<div id=\"content-comments\">{any}<div id=\"niu2-sidebar-meta\"{any}<div class=\"niu2-sidebar-value\">{pubdate}</div>"],
"Feed.Path": "blog.atime.me.xml",
"Feed.PubDatePattern": ["{year}-{month}-{day} {hour}:{minute}:{second}"],
"Request.Interval": 0
}
]
Expand Down
2 changes: 1 addition & 1 deletion filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ func RegexpFilter(filterReg *regexp.Regexp, data []byte) (outdata []byte) {
for _, match := range matches {
for patInd, patName := range filterReg.SubexpNames() {
switch patName {
case FILTER_NAME:
case PATTERN_FILTER:
outdata = append(outdata, match[patInd]...)
}
}
Expand Down
14 changes: 7 additions & 7 deletions html_parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,17 +104,17 @@ func ParseIndexHtml(feedTar *FeedTarget) (feed *Feed, ok bool) {
entry.IndexPattern = indexReg
for patInd, patName := range indexReg.SubexpNames() {
switch patName {
case TITLE_NAME:
case PATTERN_TITLE:
entry.Title = string(match[patInd])
case LINK_NAME:
case PATTERN_LINK:
// normalize entry link which may be relative
entry.Link, err = tarURL.Parse(string(match[patInd]))
if nil != err {
log.Printf("[ERROR] error parsing entry link %s: %s", entry.Link, err)
}
case PUBDATE_NAME:
case PATTERN_PUBDATE:
var pubDate time.Time
pubDate, err = ParsePubDate(FindPubDate(feedTar, tarURL), string(match[patInd]))
pubDate, err = ParsePubDate(FindPubDateReg(feedTar, feed.URL), string(match[patInd]))
if nil != err {
log.Printf("[ERROR] error parsing pubdate of link %s: %s", entry.Link, err)
} else {
Expand Down Expand Up @@ -213,11 +213,11 @@ func ParseContentHtml(feedTar *FeedTarget, feed *Feed) (ok bool) {
}
for patInd, patName := range contentReg.SubexpNames() {
switch patName {
case CONTENT_NAME:
case PATTERN_CONTENT:
entry.Content = match[patInd]
case PUBDATE_NAME:
case PATTERN_PUBDATE:
var pubDate time.Time
pubDate, err = ParsePubDate(FindPubDate(feedTar, feed.URL), string(match[patInd]))
pubDate, err = ParsePubDate(FindPubDateReg(feedTar, feed.URL), string(match[patInd]))
if nil != err {
log.Printf("[ERROR] error parsing pubdate of link %s: %s", entry.Link, err)
} else {
Expand Down
56 changes: 39 additions & 17 deletions pattern.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,18 +11,25 @@ import (
// PatternToRegex returns pat with every predefined pattern token replaced by
// its regular-expression source. All substitutions are registered as
// (token, replacement) pairs on a single strings.NewReplacer, which is then
// applied to pat once.
//
// GenPDPName and GenPDPRegexStr are not visible in this view; presumably
// GenPDPName(name) yields the "{name}" token and GenPDPRegexStr(name, flag)
// yields a named capture group for it - TODO confirm against their definitions.
//
// NOTE(review): this listing comes from a diff view, so the PATTERN_*_REG
// pairs and the GenPDP* pairs below may be the removed and added lines of the
// same hunk rather than code that coexists - confirm against the actual file.
func PatternToRegex(pat string) string {
r := strings.NewReplacer(
PATTERN_ANY, PATTERN_ANY_REG,
PATTERN_TITLE, PATTERN_TITLE_REG,
PATTERN_LINK, PATTERN_LINK_REG,
PATTERN_CONTENT, PATTERN_CONTENT_REG,
PATTERN_PUBDATE, PATTERN_PUBDATE_REG,
PATTERN_FILTER, PATTERN_FILTER_REG,
GenPDPName(PATTERN_TITLE), GenPDPRegexStr(PATTERN_TITLE, true),
GenPDPName(PATTERN_LINK), GenPDPRegexStr(PATTERN_LINK, true),
GenPDPName(PATTERN_CONTENT), GenPDPRegexStr(PATTERN_CONTENT, false),
GenPDPName(PATTERN_FILTER), GenPDPRegexStr(PATTERN_FILTER, true),
GenPDPName(PATTERN_PUBDATE), GenPDPRegexStr(PATTERN_PUBDATE, true),
GenPDPName(PATTERN_YEAR), GenPDPRegexStr(PATTERN_YEAR, true),
GenPDPName(PATTERN_MONTH), GenPDPRegexStr(PATTERN_MONTH, true),
GenPDPName(PATTERN_DAY), GenPDPRegexStr(PATTERN_DAY, true),
GenPDPName(PATTERN_HOUR), GenPDPRegexStr(PATTERN_HOUR, true),
GenPDPName(PATTERN_MINUTE), GenPDPRegexStr(PATTERN_MINUTE, true),
GenPDPName(PATTERN_SECOND), GenPDPRegexStr(PATTERN_SECOND, true),
)

return r.Replace(pat)
}

// IndexPattern must contain both {title} and {link}, and maybe {pubdate}
// ContentPattern must contain {content}, and maybe {pubdate}
// IndexPattern must contain both {title} and {link}
// ContentPattern must contain exactly one {description}
// Either IndexPattern or ContentPattern may contain {pubdate}, but not both.
func CheckPatterns(tar *TargetConfig) bool {
if nil == tar {
log.Printf("[ERROR] invliad target, nil")
Expand All @@ -42,8 +49,8 @@ func CheckPatterns(tar *TargetConfig) bool {
return false
}

if 1 != strings.Count(indexPat, PATTERN_TITLE) || 1 != strings.Count(indexPat, PATTERN_LINK) {
log.Printf("[ERROR] index pattern %s should contain 1 %s and 1 %s ", indexPat, PATTERN_TITLE, PATTERN_LINK)
if 1 != strings.Count(indexPat, GenPDPName(PATTERN_TITLE)) || 1 != strings.Count(indexPat, GenPDPName(PATTERN_LINK)) {
log.Printf("[ERROR] index pattern %s should contain 1 %s and 1 %s ", indexPat, GenPDPName(PATTERN_TITLE), GenPDPName(PATTERN_LINK))
return false
}
}
Expand All @@ -55,13 +62,13 @@ func CheckPatterns(tar *TargetConfig) bool {
return false
}

if 1 != strings.Count(contentPat, PATTERN_CONTENT) {
log.Printf("[ERROR] content pattern %s should contain 1 %s", contentPat, PATTERN_CONTENT)
if 1 != strings.Count(contentPat, GenPDPName(PATTERN_CONTENT)) {
log.Printf("[ERROR] content pattern %s should contain 1 %s", contentPat, GenPDPName(PATTERN_CONTENT))
return false
}

if strings.Contains(contentPat, PATTERN_TITLE) || strings.Contains(contentPat, PATTERN_LINK) {
log.Printf("[ERROR] %s should not contain %s or %s", contentPat, PATTERN_TITLE, PATTERN_LINK)
if strings.Contains(contentPat, GenPDPName(PATTERN_TITLE)) || strings.Contains(contentPat, GenPDPName(PATTERN_LINK)) {
log.Printf("[ERROR] %s should not contain %s or %s", contentPat, GenPDPName(PATTERN_TITLE), GenPDPName(PATTERN_LINK))
return false
}
}
Expand All @@ -77,8 +84,8 @@ func CheckPatterns(tar *TargetConfig) bool {
if "" == indFilterPat {
continue
}
if 1 > strings.Count(indFilterPat, PATTERN_FILTER) {
log.Printf("[ERROR] index filter pattern %s should be empty or contain more than one %s", indFilterPat, PATTERN_FILTER)
if 1 > strings.Count(indFilterPat, GenPDPName(PATTERN_FILTER)) {
log.Printf("[ERROR] index filter pattern %s should be empty or contain more than one %s", indFilterPat, GenPDPName(PATTERN_FILTER))
return false
}
}
Expand All @@ -87,12 +94,14 @@ func CheckPatterns(tar *TargetConfig) bool {
if "" == contFilterPat {
continue
}
if 1 > strings.Count(contFilterPat, PATTERN_FILTER) {
log.Printf("[ERROR] content filter pattern %s should be empty or contain more than one %s", contFilterPat, PATTERN_FILTER)
if 1 > strings.Count(contFilterPat, GenPDPName(PATTERN_FILTER)) {
log.Printf("[ERROR] content filter pattern %s should be empty or contain more than one %s", contFilterPat, GenPDPName(PATTERN_FILTER))
return false
}
}

//@TODO check pubdate pattern

return true
}

Expand All @@ -101,6 +110,7 @@ func CompilePatterns(feedTar *FeedTarget, tar *TargetConfig) (err error) {
feedTar.ContentRegs = make([]*regexp.Regexp, len(tar.ContentPatterns))
feedTar.IndexFilterRegs = make([]*regexp.Regexp, len(tar.IndexFilterPatterns))
feedTar.ContentFilterRegs = make([]*regexp.Regexp, len(tar.ContentFilterPatterns))
feedTar.PubDateRegs = make([]*regexp.Regexp, len(tar.PubDatePatterns))

// index pattern
for j := 0; j < len(tar.IndexPatterns); j++ {
Expand Down Expand Up @@ -144,6 +154,18 @@ func CompilePatterns(feedTar *FeedTarget, tar *TargetConfig) (err error) {
}
}

// publish date pattern
for j := 0; j < len(tar.PubDatePatterns); j++ {
if "" == strings.TrimSpace(tar.PubDatePatterns[j]) {
continue
}
feedTar.PubDateRegs[j], err = regexp.Compile(PatternToRegex(tar.PubDatePatterns[j]))
if nil != err {
log.Printf("[ERROR] error compiling publish date pattern %s", tar.PubDatePatterns[j])
return
}
}

return
}

Expand Down
Loading

0 comments on commit be495c3

Please sign in to comment.