diff --git a/builtin.go b/builtin.go new file mode 100644 index 0000000..f56f439 --- /dev/null +++ b/builtin.go @@ -0,0 +1,97 @@ +package pagser + +import ( + "github.com/PuerkitoBio/goquery" +) + +// CallFunc write function interface + +// # Define Global Function +// +// func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { +// //Todo +// return "Hello", nil +// } +// +// //Register function +// pagser.RegisterFunc("MyFunc", MyFunc) +// +// //Use function +// type PageData struct{ +// Text string `pagser:"h1->MyFunc()"` +// } +// +// +// # Define Struct Function +// //Use function +// type PageData struct{ +// Text string `pagser:"h1->MyFunc()"` +// } +// +// func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { +// //Todo +// return "Hello", nil +// } +// +// # Lookup function priority order +// +// struct method -> parent method -> ... -> global +// +// # Implicit convert type +// +// Automatic type conversion, Output result string convert to int, int64, float64... 
+//
+// CallFunc is the signature every builtin or registered parse function must have.
+type CallFunc func(node *goquery.Selection, args ...string) (out interface{}, err error)
+
+// builtinFun is the BuiltinFunctions value whose methods back the builtin function names.
+var builtinFun BuiltinFunctions
+
+// builtinSel is the BuiltinSelections value whose methods back the builtin selector names.
+var builtinSel BuiltinSelections
+
+// builtinFuncMap maps each builtin tag name to the method that implements it.
+var builtinFuncMap = map[string]CallFunc{
+	"absHref":       builtinFun.AbsHref,
+	"attr":          builtinFun.Attr,
+	"attrConcat":    builtinFun.AttrConcat,
+	"attrEmpty":     builtinFun.AttrEmpty,
+	"attrSplit":     builtinFun.AttrSplit,
+	"eachAttr":      builtinFun.EachAttr,
+	"eachAttrEmpty": builtinFun.EachAttrEmpty,
+	"eachHtml":      builtinFun.EachHtml,
+	"eachOutHtml":   builtinFun.EachOutHtml,
+	"eachText":      builtinFun.EachText,
+	"eachTextEmpty": builtinFun.EachTextEmpty,
+	"eachTextJoin":  builtinFun.EachTextJoin,
+	"eqAndAttr":     builtinFun.EqAndAttr,
+	"eqAndHtml":     builtinFun.EqAndHtml,
+	"eqAndOutHtml":  builtinFun.EqAndOutHtml,
+	"eqAndText":     builtinFun.EqAndText,
+	"html":          builtinFun.Html,
+	"outerHtml":     builtinFun.OutHtml,
+	"text":          builtinFun.Text,
+	"textConcat":    builtinFun.TextConcat,
+	"textEmpty":     builtinFun.TextEmpty,
+	"textSplit":     builtinFun.TextSplit,
+	// selector
+	"child":        builtinSel.Child,
+	"eq":           builtinSel.Eq,
+	"first":        builtinSel.First,
+	"last":         builtinSel.Last,
+	"next":         builtinSel.Next,
+	"parent":       builtinSel.Parent,
+	"parents":      builtinSel.Parents,
+	"parentsUntil": builtinSel.ParentsUntil,
+	"prev":         builtinSel.Prev,
+	"siblings":     builtinSel.Siblings,
+}
+
+// RegisterFunc registers fn under name in this Pagser's function table (p.ctxFuncs), e.g.
+//	pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
+//		//Todo
+//		return "Hello", nil
+//	})
+func (p *Pagser) RegisterFunc(name string, fn CallFunc) {
+	p.ctxFuncs[name] = fn
+}
diff --git a/function.go b/builtin_functions.go
similarity index 66%
rename from function.go
rename to builtin_functions.go
index f33eaf7..c46bc56 100644
--- a/function.go
+++ b/builtin_functions.go
@@ -2,86 +2,36 @@ package pagser
 
 import (
"fmt" + "github.com/PuerkitoBio/goquery" "github.com/spf13/cast" + "net/url" "strconv" "strings" - - "github.com/PuerkitoBio/goquery" ) -// CallFunc write function interface - -// # Define Global Function -// -// func MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { -// //Todo -// return "Hello", nil -// } -// -// //Register function -// pagser.RegisterFunc("MyFunc", MyFunc) -// -// //Use function -// type PageData struct{ -// Text string `pagser:"h1->MyFunc()"` -// } -// -// -// # Define Struct Function -// //Use function -// type PageData struct{ -// Text string `pagser:"h1->MyFunc()"` -// } -// -// func (pd PageData) MyFunc(node *goquery.Selection, args ...string) (out interface{}, err error) { -// //Todo -// return "Hello", nil -// } -// -// # Lookup function priority order -// -// struct method -> parent method -> ... -> global -// -// # Implicit convert type -// -// Automatic type conversion, Output result string convert to int, int64, float64... -// -// CallFunc is a define function interface -type CallFunc func(node *goquery.Selection, args ...string) (out interface{}, err error) - // Builtin functions are registered with a lowercase initial, eg: Text -> text() type BuiltinFunctions struct { } -var builtinFuncObj BuiltinFunctions -var builtinFuncMap = map[string]CallFunc{ - "attr": builtinFuncObj.Attr, - "attrConcat": builtinFuncObj.AttrConcat, - "attrEmpty": builtinFuncObj.AttrEmpty, - "attrSplit": builtinFuncObj.AttrSplit, - "eachAttr": builtinFuncObj.EachAttr, - "eachAttrEmpty": builtinFuncObj.EachAttrEmpty, - "eachHtml": builtinFuncObj.EachHtml, - "eachOutHtml": builtinFuncObj.EachOutHtml, - "eachText": builtinFuncObj.EachText, - "eachTextEmpty": builtinFuncObj.EachTextEmpty, - "eachTextJoin": builtinFuncObj.EachTextJoin, - "eqAndAttr": builtinFuncObj.EqAndAttr, - "eqAndHtml": builtinFuncObj.EqAndHtml, - "eqAndOutHtml": builtinFuncObj.EqAndOutHtml, - "eqAndText": builtinFuncObj.EqAndText, - "html": builtinFuncObj.Html, - 
"nodeChild":     builtinFuncObj.NodeChild,
-	"nodeEq":        builtinFuncObj.NodeEq,
-	"nodeNext":      builtinFuncObj.NodeNext,
-	"nodeParent":    builtinFuncObj.NodeParent,
-	"nodePrev":      builtinFuncObj.NodePrev,
-	"nodeSiblings":  builtinFuncObj.NodeSiblings,
-	"outerHtml":     builtinFuncObj.OutHtml,
-	"text":          builtinFuncObj.Text,
-	"textConcat":    builtinFuncObj.TextConcat,
-	"textEmpty":     builtinFuncObj.TextEmpty,
-	"textSplit":     builtinFuncObj.TextSplit,
+// absHref(baseUrl) gets the element's `href` attribute and resolves it against baseUrl, returning the absolute *url.URL.
+// `baseUrl` is the base url like `https://example.com/`; it is required, and an error is returned if it or the href cannot be parsed.
+//	//Pagser
+//	struct {
+//		Example string `pagser:".selector->absHref('https://github.com/')"`
+//	}
+func (builtin BuiltinFunctions) AbsHref(selection *goquery.Selection, args ...string) (out interface{}, err error) {
+	if len(args) < 1 {
+		return "", fmt.Errorf("args must has baseUrl")
+	}
+	baseUrl, err := url.Parse(args[0])
+	if err != nil {
+		return "", fmt.Errorf("invalid base url: %v error: %v", args[0], err) // baseUrl is nil on failure; report the raw argument instead
+	}
+	hrefUrl, err := url.Parse(selection.AttrOr("href", "")) // a missing href is parsed as the empty reference
+	if err != nil {
+		return "", err
+	}
+	return baseUrl.ResolveReference(hrefUrl), nil
 }
 
 // attr(name, defaultValue='') get element attribute value, return string.
@@ -383,122 +333,6 @@ func (builtin BuiltinFunctions) Html(node *goquery.Selection, args ...string) (o
 	return node.Html()
 }
 
-// nodeChild(selector = '') gets the child elements of each element in the Selection,
-// Filtered by the specified selector if selector not empty,
-// It returns Selection object containing these elements for nested struct..
-// struct { -// SubStruct struct { -// Example string `pagser:".selector->text()"` -// } `pagser:".selector->nodeChild()"` -// } -func (builtin BuiltinFunctions) NodeChild(node *goquery.Selection, args ...string) (out interface{}, err error) { - selector := "" - if len(args) > 0 { - selector = strings.TrimSpace(args[0]) - } - if selector != "" { - return node.ChildrenFiltered(selector), nil - } - return node.Children(), nil -} - -// nodeEq(index) reduces the set of matched elements to the one at the specified index. -// If a negative index is given, it counts backwards starting at the end of the -// set. It returns a Selection object for nested struct, and an empty Selection object if the -// index is invalid. -// struct { -// SubStruct struct { -// Example string `pagser:".selector->text()"` -// } `pagser:".selector->nodeEq(0)"` -// } -func (builtin BuiltinFunctions) NodeEq(node *goquery.Selection, args ...string) (out interface{}, err error) { - if len(args) < 1 { - return "", fmt.Errorf("nodeEq(index) must has `index` value") - } - indexValue := strings.TrimSpace(args[0]) - idx, err := strconv.Atoi(indexValue) - if err != nil { - return "", fmt.Errorf("index=`" + indexValue + "` is not number: " + err.Error()) - } - return node.Eq(idx), nil -} - -// nodeNext() gets the immediately following sibling of each element in the Selection. -// Filtered by the specified selector if selector not empty, -// It returns Selection object containing these elements for nested struct. -// struct { -// SubStruct struct { -// Example string `pagser:".selector->text()"` -// } `pagser:".selector->nodeNext()"` -// } -func (builtin BuiltinFunctions) NodeNext(node *goquery.Selection, args ...string) (out interface{}, err error) { - selector := "" - if len(args) > 0 { - selector = strings.TrimSpace(args[0]) - } - if selector != "" { - return node.NextFiltered(selector), nil - } - return node.Next(), nil -} - -// nodeParent() gets the parent elements of each element in the Selection. 
-// Filtered by the specified selector if selector not empty, -// It returns Selection object containing these elements for nested struct. -// struct { -// SubStruct struct { -// Example string `pagser:".selector->text()"` -// } `pagser:".selector->nodeParent()"` -// } -func (builtin BuiltinFunctions) NodeParent(node *goquery.Selection, args ...string) (out interface{}, err error) { - selector := "" - if len(args) > 0 { - selector = strings.TrimSpace(args[0]) - } - if selector != "" { - return node.ParentFiltered(selector), nil - } - return node.Parent(), nil -} - -// nodePrev() gets the immediately preceding sibling of each element in the Selection. -// Filtered by the specified selector if selector not empty, -// It returns Selection object containing these elements for nested struct. -// struct { -// SubStruct struct { -// Example string `pagser:".selector->text()"` -// } `pagser:".selector->nodePrev()"` -// } -func (builtin BuiltinFunctions) NodePrev(node *goquery.Selection, args ...string) (out interface{}, err error) { - selector := "" - if len(args) > 0 { - selector = strings.TrimSpace(args[0]) - } - if selector != "" { - return node.PrevFiltered(selector), nil - } - return node.Prev(), nil -} - -// nodeSiblings() gets the siblings of each element in the Selection. -// Filtered by the specified selector if selector not empty, -// It returns Selection object containing these elements for nested struct. -// struct { -// SubStruct struct { -// Example string `pagser:".selector->text()"` -// } `pagser:".selector->nodeSiblings()"` -// } -func (builtin BuiltinFunctions) NodeSiblings(node *goquery.Selection, args ...string) (out interface{}, err error) { - selector := "" - if len(args) > 0 { - selector = strings.TrimSpace(args[0]) - } - if selector != "" { - return node.SiblingsFiltered(selector), nil - } - return node.Siblings(), nil -} - // outerHtml() get element outer html, return string. 
// struct {
 // Example string `pagser:".selector->outerHtml()"`
@@ -582,12 +416,3 @@ func (builtin BuiltinFunctions) TextSplit(node *goquery.Selection, args ...strin
 	}
 	return list, nil
 }
-
-// RegisterFunc register function for parse result
-// pagser.RegisterFunc("MyFunc", func(node *goquery.Selection, args ...string) (out interface{}, err error) {
-// //Todo
-// return "Hello", nil
-// })
-func (p *Pagser) RegisterFunc(name string, fn CallFunc) {
-	p.ctxFuncs[name] = fn
-}
diff --git a/builtin_selections.go b/builtin_selections.go
new file mode 100644
index 0000000..69ea20c
--- /dev/null
+++ b/builtin_selections.go
@@ -0,0 +1,189 @@
+package pagser
+
+import (
+	"fmt"
+	"github.com/PuerkitoBio/goquery"
+	"strconv"
+	"strings"
+)
+
+// BuiltinSelections groups the builtin selector functions; each is registered with a lowercase initial, eg: Child -> child()
+type BuiltinSelections struct {
+}
+
+// child(selector='') gets the child elements of each element in the Selection,
+// Filtered by the specified selector if selector not empty,
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->child()"`
+//	}
+func (builtin BuiltinSelections) Child(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	selector := ""
+	if len(args) > 0 {
+		selector = strings.TrimSpace(args[0])
+	}
+	if selector != "" {
+		return node.ChildrenFiltered(selector), nil
+	}
+	return node.Children(), nil
+}
+
+// eq(index) reduces the set of matched elements to the one at the specified index.
+// If a negative index is given, it counts backwards starting at the end of the set.
+// It returns a Selection object for nested struct, and an empty Selection object if the
+// index is invalid.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->eq(0)"`
+//	}
+func (builtin BuiltinSelections) Eq(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	if len(args) < 1 {
+		return "", fmt.Errorf("eq(index) must has `index` value")
+	}
+	indexValue := strings.TrimSpace(args[0])
+	idx, err := strconv.Atoi(indexValue)
+	if err != nil {
+		return "", fmt.Errorf("index=`%s` is not number: %v", indexValue, err) // constant format: a '%' in the input must not be read as a verb
+	}
+	return node.Eq(idx), nil
+}
+
+// first() reduces the set of matched elements to the first in the set.
+// It returns a new Selection object, and an empty Selection object if
+// the selection is empty. Any arguments are ignored.
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->first()"`
+//	}
+func (builtin BuiltinSelections) First(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	return node.First(), nil
+}
+
+// last() reduces the set of matched elements to the last in the set.
+// It returns a new Selection object, and an empty Selection object if
+// the selection is empty. Any arguments are ignored.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->last()"`
+//	}
+func (builtin BuiltinSelections) Last(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	return node.Last(), nil
+}
+
+// next(selector='') gets the immediately following sibling of each element in the Selection.
+// Filtered by the specified selector if selector not empty,
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->next()"`
+//	}
+func (builtin BuiltinSelections) Next(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	selector := ""
+	if len(args) > 0 {
+		selector = strings.TrimSpace(args[0])
+	}
+	if selector != "" {
+		return node.NextFiltered(selector), nil
+	}
+	return node.Next(), nil
+}
+
+// parent(selector='') gets the parent elements of each element in the Selection.
+// Filtered by the specified selector if selector not empty,
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->parent()"`
+//	}
+func (builtin BuiltinSelections) Parent(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	selector := ""
+	if len(args) > 0 {
+		selector = strings.TrimSpace(args[0])
+	}
+	if selector != "" {
+		return node.ParentFiltered(selector), nil
+	}
+	return node.Parent(), nil
+}
+
+// parents(selector='') gets the ancestors of each element in the Selection.
+// Filtered by the specified selector if selector not empty,
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->parents()"`
+//	}
+func (builtin BuiltinSelections) Parents(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	selector := ""
+	if len(args) > 0 {
+		selector = strings.TrimSpace(args[0])
+	}
+	if selector != "" {
+		return node.ParentsFiltered(selector), nil
+	}
+	return node.Parents(), nil
+}
+
+// parentsUntil(selector) gets the ancestors of each element in the Selection, up to but
+// not including the element matched by the selector. It returns a new Selection
+// object containing the matched elements.
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->parentsUntil('.wrap')"`
+//	}
+func (builtin BuiltinSelections) ParentsUntil(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	if len(args) < 1 {
+		return nil, fmt.Errorf("parentsUntil must has selector")
+	}
+	selector := strings.TrimSpace(args[0])
+	return node.ParentsUntil(selector), nil
+}
+
+// prev(selector='') gets the immediately preceding sibling of each element in the Selection.
+// Filtered by the specified selector if selector not empty,
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->prev()"`
+//	}
+func (builtin BuiltinSelections) Prev(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	selector := ""
+	if len(args) > 0 {
+		selector = strings.TrimSpace(args[0])
+	}
+	if selector != "" {
+		return node.PrevFiltered(selector), nil
+	}
+	return node.Prev(), nil
+}
+
+// siblings(selector='') gets the siblings of each element in the Selection.
+// Filtered by the specified selector if selector not empty,
+// It returns Selection object containing these elements for nested struct.
+//	struct {
+//		SubStruct struct {
+//			Example string `pagser:".selector->text()"`
+//		} `pagser:".selector->siblings()"`
+//	}
+func (builtin BuiltinSelections) Siblings(node *goquery.Selection, args ...string) (out interface{}, err error) {
+	selector := ""
+	if len(args) > 0 {
+		selector = strings.TrimSpace(args[0])
+	}
+	if selector != "" {
+		return node.SiblingsFiltered(selector), nil
+	}
+	return node.Siblings(), nil
+}
diff --git a/parse_test.go b/parse_test.go
index 2b54dc8..3d2da1c 100644
--- a/parse_test.go
+++ b/parse_test.go
@@ -80,21 +80,27 @@ type ParseData struct {
 	NavList []struct {
 		ID int `pagser:"->attrEmpty(id, -1)"`
 		Link struct {
-			Name string `pagser:"->text()"`
-			Url  string `pagser:"->attr(href)"`
+			Name   string `pagser:"->text()"`
+			Url    string `pagser:"->attr(href)"`
+			AbsUrl string `pagser:"->absHref('https://thisvar.com')"`
 		} `pagser:"a"`
 		LinkHtml string `pagser:"a->html()"`
 		ParentFuncName string `pagser:"a->ParentFunc()"`
 	} `pagser:".navlink li"`
+	NavFirst struct {
+		ID int `pagser:"->attrEmpty(id, 0)"`
+		Name string `pagser:"a->text()"`
+		Url string `pagser:"a->attr(href)"`
+	} `pagser:".navlink li->first()"`
 	NavLast struct {
 		ID int `pagser:"->attrEmpty(id, 0)"`
 		Name string `pagser:"a->text()"`
 		Url string `pagser:"a->attr(href)"`
-	} `pagser:".navlink li:last-child"`
+	} `pagser:".navlink li->last()"`
 	SubStruct struct {
 		Label string `pagser:"label"`
 		Values []string `pagser:".item->eachAttr(value)"`
-	} `pagser:".group->nodeEq(0)"`
+	} `pagser:".group->eq(0)"`
 	SubPtrStruct *struct {
 		Label string `pagser:"label"`
 		Values []string `pagser:".item->eachAttr(value)"`
@@ -143,40 +149,49 @@ type ParseData struct {
 	CastFloat64Array []float64 `pagser:".item[name='float']->eachAttrEmpty(value, 0)"`
 	NodeChild []struct {
 		Value string `pagser:"->text()"`
-	} `pagser:".group->nodeChild()"`
+	} `pagser:".group->child()"`
 	NodeChildSelector []struct {
 		Value string `pagser:"->text()"`
-	} `pagser:".group->nodeChild('h2')"`
+	} `pagser:".group->child('h2')"`
NodeEqFirst struct { Value string `pagser:"h2->text()"` - } `pagser:".group->nodeEq(0)"` + } `pagser:".group->eq(0)"` NodeEqLast struct { Value string `pagser:"h2->text()"` - } `pagser:".group->nodeEq(-1)"` + } `pagser:".group->eq(-1)"` NodeEqPrev []struct { Value string `pagser:"->text()"` - } `pagser:".item:last-child->nodePrev()"` + } `pagser:".item:last-child->prev()"` NodeEqPrevSelector struct { Value string `pagser:"->text()"` - } `pagser:".item:last-child->nodePrev('[id=\"1\"]')"` + } `pagser:".item:last-child->prev('[id=\"1\"]')"` NodeEqNext []struct { Value string `pagser:"->text()"` - } `pagser:".item:first-child->nodeNext()"` + } `pagser:".item:first-child->next()"` NodeEqNextSelector struct { Value string `pagser:"->text()"` - } `pagser:".item:first-child->nodeNext('[id=\"2\"]')"` + } `pagser:".item:first-child->next('[id=\"2\"]')"` NodeParent []struct { Value string `pagser:"h2->text()"` - } `pagser:"h2:first-child->nodeParent()"` + } `pagser:"h2:first-child->parent()"` + NodeParents []struct { + Value string `pagser:"h2->text()"` + } `pagser:"h2:first-child->parents()"` + NodeParentsSelector []struct { + Value string `pagser:"h2->text()"` + } `pagser:"h2:first-child->parents('[id=\"b\"]')"` + NodeParentsUntil []struct { + Value string `pagser:"h2->text()"` + } `pagser:"h2:first-child->parentsUntil('[id=\"b\"]')"` NodeParentSelector []struct { Value string `pagser:"h2->text()"` - } `pagser:"h2:first-child->nodeParent('[id=\"a\"]')"` + } `pagser:"h2:first-child->parent('[id=\"a\"]')"` NodeEqSiblings []struct { Value string `pagser:"->text()"` - } `pagser:".item:first-child->nodeSiblings()"` + } `pagser:".item:first-child->siblings()"` NodeEqSiblingsSelector []struct { Value string `pagser:"->text()"` - } `pagser:".item:first-child->nodeSiblings('[id=\"2\"]')"` + } `pagser:".item:first-child->siblings('[id=\"2\"]')"` } // this method will auto call, not need register.