-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlinks.go
122 lines (98 loc) · 1.96 KB
/
links.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package main
import (
"bytes"
"errors"
"html"
"io"
"io/ioutil"
"log"
"net/url"
"path"
"strings"
"github.com/PuerkitoBio/goquery"
"mvdan.cc/xurls/v2"
)
// Link describes one hyperlink extracted from a document.
type Link struct {
// NodeName is the name of the HTML node the link was taken from. It is
// empty for links found by plain-text URL matching.
NodeName string
// Text is the label for the link. The extractors in this file set it to
// the URL itself when no other text is available.
Text string
// URL is the link target exactly as it appeared in the input.
URL string
}
// NormalizeString decodes HTML entities in s and then collapses every run of
// whitespace into a single space, dropping leading and trailing whitespace.
func NormalizeString(s string) string {
	words := strings.Fields(html.UnescapeString(s))
	return strings.Join(words, " ")
}
// ImageText returns a label for the image element s.
//
// It uses the first non-empty value of: the "alt" attribute, the "title"
// attribute, and the last path element of the "src" URL. It returns "" when
// none of them yields usable text, including when src cannot be parsed as a
// URL or has no file name in its path.
func ImageText(s *goquery.Selection) string {
	// The presence flag from Attr is ignored on purpose: a missing attribute
	// and an empty one are treated the same way.
	if alt, _ := s.Attr("alt"); alt != "" {
		return alt
	}
	if title, _ := s.Attr("title"); title != "" {
		return title
	}

	src, _ := s.Attr("src")
	if src == "" {
		return ""
	}

	imageURL, err := url.Parse(src)
	if err != nil {
		// One malformed src must not terminate the program; report it and
		// let the caller fall back to other text.
		log.Printf("parsing image src %q: %v", src, err)
		return ""
	}

	// path.Base reports "." for an empty path and "/" for a path made only
	// of slashes; neither is a meaningful label.
	switch name := path.Base(imageURL.Path); name {
	case ".", "/":
		return ""
	default:
		return name
	}
}
// FindLinksHTML parses file as HTML and returns one Link for every <a>
// element that has an href attribute, in document order.
//
// For each anchor:
//   - URL is the raw href value.
//   - Text is the anchor's text content, unless one of its direct children is
//     an <img>, in which case the text comes from ImageText for the first
//     such image. The text is passed through NormalizeString and falls back
//     to the href when it ends up empty.
//   - NodeName is "img" when the text came from an image child. Otherwise it
//     is the node name of the anchor's last direct child element, or of the
//     anchor itself when it has no child elements.
//
// It returns an error when the document cannot be parsed or when no links
// are found.
func FindLinksHTML(file io.Reader) ([]Link, error) {
	doc, err := goquery.NewDocumentFromReader(file)
	if err != nil {
		return nil, err
	}

	var links []Link
	doc.Find("a").Each(func(_ int, anchor *goquery.Selection) {
		// Named href rather than url so the net/url package is not shadowed.
		href, ok := anchor.Attr("href")
		if !ok {
			return
		}

		node := goquery.NodeName(anchor)
		text := anchor.Text()

		// EachWithBreak is required here: returning from a plain Each
		// callback only ends the current iteration, so siblings after the
		// image would overwrite node and leave it inconsistent with text.
		anchor.Children().EachWithBreak(func(_ int, child *goquery.Selection) bool {
			node = goquery.NodeName(child)
			if node != "img" {
				return true
			}
			text = ImageText(child)
			return false
		})

		text = NormalizeString(text)
		if text == "" {
			text = href
		}
		links = append(links, Link{NodeName: node, Text: text, URL: href})
	})

	if len(links) == 0 {
		return nil, errors.New("no links found")
	}
	return links, nil
}
// FindLinksRegEx scans file as plain text with the xurls strict matcher and
// returns one Link per match, in order of appearance. Each Link has an empty
// NodeName and uses the matched URL as both Text and URL.
//
// It returns an error when no URL is found.
func FindLinksRegEx(file []byte) ([]Link, error) {
	matches := xurls.Strict().FindAllString(string(file), -1)
	if len(matches) == 0 {
		return nil, errors.New("no links found")
	}

	found := make([]Link, 0, len(matches))
	for _, match := range matches {
		found = append(found, Link{NodeName: "", Text: match, URL: match})
	}
	return found, nil
}
// FindLinks extracts links from file. It first treats the input as HTML via
// FindLinksHTML; if that returns an error (including when it finds no
// links), it falls back to plain-text URL matching via FindLinksRegEx.
//
// The input is read completely before either pass runs, so a read error is
// returned to the caller instead of being hidden by a fallback over a
// partially read buffer, and both passes see exactly the same bytes.
func FindLinks(file io.Reader) ([]Link, error) {
	data, err := ioutil.ReadAll(file)
	if err != nil {
		return nil, err
	}

	links, err := FindLinksHTML(bytes.NewReader(data))
	if err != nil {
		// The in-memory reader cannot fail, so an error here means the HTML
		// pass produced nothing usable; try the plain-text matcher.
		return FindLinksRegEx(data)
	}
	return links, nil
}