-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler.go
76 lines (69 loc) · 1.47 KB
/
crawler.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
// Before using, please make sure you have
// golang.org/x/net/html package installed
// If not, run 'go get golang.org/x/net/html'
package main
import (
"./links"
"flag"
"fmt"
"log"
)
// Command-line flags configuring the crawl.
var (
	// startLink is the URL the crawl begins from.
	startLink = flag.String("start", "https://google.com", "starting link")
	// maxWorker bounds the number of concurrent crawler goroutines.
	maxWorker = flag.Int("n", 10, "maximum amount of concurrent workers")
	// maxDepth is how many levels of links to follow from the start page.
	maxDepth = flag.Int("d", 1, "maximum depth of crawling")
)
// crawl prints url, extracts every link found on its page via
// links.Extract, and returns them. Extraction is best-effort: an
// error is logged and the (possibly nil) result is returned anyway.
func crawl(url string) []string {
	fmt.Println(url)
	found, err := links.Extract(url)
	if err != nil {
		log.Print(err)
	}
	return found
}
// main runs a breadth-first concurrent crawl: a fixed pool of worker
// goroutines reads URLs from unseenLinks, crawls each, and sends the
// discovered links back on worklist as one batch per crawled URL. The
// main goroutine deduplicates links and tracks depth by counting how
// many batches are still outstanding for the current level.
func main() {
	flag.Parse()
	mWorker := *maxWorker
	mDepth := *maxDepth
	startLinks := []string{*startLink}

	// worklist carries batches of links produced by workers;
	// unseenLinks feeds individual not-yet-visited URLs to the pool.
	// Both are unbuffered, so every send must be matched by a receive.
	worklist := make(chan []string)
	unseenLinks := make(chan string)

	// Seed the crawl with the starting link(s) as the first batch.
	go func() { worklist <- startLinks }()

	// Worker pool: each worker crawls URLs from unseenLinks and ships
	// the results back on worklist. The send happens in a fresh
	// goroutine so a worker never blocks waiting for main to receive.
	// NOTE(review): these goroutines are never stopped (unseenLinks is
	// never closed); they persist until the process exits — acceptable
	// only because main returns when the crawl is done.
	for i := 0; i < mWorker; i++ {
		go func() {
			for link := range unseenLinks {
				foundLinks := crawl(link)
				go func() { worklist <- foundLinks }()
			}
		}()
	}

	// seen deduplicates URLs across the entire crawl.
	seen := make(map[string]bool)
	// linkNOld counts batches still expected for the current depth;
	// linkNNew counts unique new links queued for the next depth.
	linkNOld := 0
	linkNNew := len(startLinks)

	// One iteration per depth level. The extra level (depth == mDepth+1)
	// crawls nothing: it only drains the batches produced by the final
	// crawled level so no sender goroutine is left blocked on worklist.
	for depth := 0; depth <= mDepth+1; depth++ {
		linkNOld, linkNNew = linkNNew, 0
		linkList := make([]string, 0)
		for list := range worklist {
			linkNOld-- // one expected batch has arrived
			if depth == mDepth+1 {
				// Drain-only level: discard results and stop once every
				// outstanding batch has been accounted for.
				if linkNOld <= 0 {
					break
				}
				continue
			}
			// Collect links not seen before; they form the next level.
			for _, link := range list {
				if !seen[link] {
					seen[link] = true
					linkList = append(linkList, link)
					linkNNew++
				}
			}
			if linkNOld <= 0 {
				// All batches for this level arrived: hand the next
				// level's links to the workers, then advance one level.
				// NOTE(review): if a level yields zero new links,
				// linkNNew stays 0 but the next iteration still blocks
				// on `range worklist` with nothing in flight — a likely
				// deadlock; confirm and guard before reusing this code.
				for _, newLink := range linkList {
					unseenLinks <- newLink
				}
				break
			}
		}
	}
}