Peng Yan
Practice Go: Web Crawler
Task: web crawler with go
Here is the link where I asked the community for help. Luckily, the Go community has tons of terrific people willing to help.
As we can see, making goroutines quit safely is tricky, and even the final version is still error-prone.
In the next section, we will take a look at context, which makes this kind of code much easier to write.
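Both versions of main.go below pass around a small Links value that pairs a crawl depth with the URL found at that depth. Its definition is not shown in these snippets; judging from how it is used (v.Level, link.Link), a minimal sketch would be:
type Links struct {
	Level int    // how deep this link sits in the crawl
	Link  string // absolute URL to fetch
}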
main.go: final version, with fewer goroutines spawned and less time consumed
func main() {
	start := time.Now()
	c := utility.New()
	c.ParsingPath = "https://ryanfakir.github.io"
	linkList := make(chan Links)
	out := make(chan Links)
	depth := 3
	wg := &sync.WaitGroup{}
	wg.Add(1)
	go func() {
		// seed the crawl with the first link
		linkList <- Links{1, c.ParsingPath}
	}()
	for i := 0; i < 2; i++ {
		go func() {
			for v := range linkList {
				fmt.Printf("runtime.NumGoroutine() = %+v\n", runtime.NumGoroutine())
				if v.Level >= depth {
					// nothing more to crawl at this depth; resolve the work
					wg.Done()
					continue
				}
				res := c.Parser(v.Link)
				go func(v Links, res []string) {
					// resolve the work for the previous-round link
					defer wg.Done()
					for _, item := range res {
						out <- Links{v.Level + 1, item}
					}
				}(v, res)
			}
		}()
	}
	go func() {
		// wait until all registered work is done, then close the channels
		// so every range loop can terminate
		wg.Wait()
		close(linkList)
		close(out)
	}()
	m := make(map[string]bool)
	for link := range out {
		if !m[link.Link] && link.Level <= depth {
			fmt.Printf("time.Since(start) = %+v\n", time.Since(start))
			// how many links have been processed so far
			// next-round links need to be registered in wg before being sent
			wg.Add(1)
			m[link.Link] = true
			linkList <- link
		}
	}
	fmt.Println("done")
}
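The shutdown trick in this version is the dedicated goroutine that calls wg.Wait() and then closes both channels, so every range loop can finish. Stripped of the crawler details, the same pattern looks like this (a minimal sketch with made-up jobs/results names):
package main

import (
	"fmt"
	"sync"
)

func main() {
	jobs := make(chan int)
	results := make(chan int)
	var wg sync.WaitGroup

	// two workers, mirroring the two crawler goroutines
	for w := 0; w < 2; w++ {
		go func() {
			for j := range jobs {
				results <- j * j
				wg.Done()
			}
		}()
	}

	// closer: once every registered job is Done, close both channels
	go func() {
		wg.Wait()
		close(jobs)
		close(results)
	}()

	// register the whole batch before producing it, so the counter
	// cannot drain while work is still in flight
	wg.Add(3)
	go func() {
		for i := 1; i <= 3; i++ {
			jobs <- i
		}
	}()

	for r := range results {
		fmt.Println(r)
	}
}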
A second approach; unluckily, I have not figured out the right way to make it quit.
func main() {
	start := time.Now()
	c := utility.New()
	c.ParsingPath = "https://ryanfakir.github.io"
	linkList := make(chan Links)
	out := make(chan Links)
	depth := 3
	// tokens is referenced below but was never declared in the snippet;
	// a buffered signal channel is assumed here
	tokens := make(chan struct{}, 1)
	go func() {
		// start with the first link
		linkList <- Links{1, c.ParsingPath}
		tokens <- struct{}{}
	}()
	var res []Links
	for i := 0; i < 2; i++ {
		go func() {
			for v := range linkList {
				fmt.Printf("runtime.NumGoroutine() = %+v\n", runtime.NumGoroutine())
				if v.Level >= depth {
					continue
				}
				select {
				case <-tokens:
					// parse the untouched urls accumulated in res
					var items []Links
					for _, unTouchedLinks := range res {
						for _, url := range c.Parser(unTouchedLinks.Link) {
							items = append(items, Links{unTouchedLinks.Level + 1, url})
						}
					}
					for _, url := range c.Parser(v.Link) {
						items = append(items, Links{v.Level + 1, url})
					}
					res = append(res, items...)
					// go func(v Links, res []string) {
					for _, item := range res {
						out <- item
					}
				default:
					// no token available: just remember the link for later
					res = append(res, v)
				}
				//}(v, res)
			}
		}()
	}
	m := make(map[string]bool)
	var i int
	for link := range out {
		if !m[link.Link] && link.Level <= depth {
			i++
			fmt.Printf("time.Since(start) = %+v\n", time.Since(start))
			fmt.Printf("i = %+v\n", i)
			m[link.Link] = true
			go func(link Links) {
				tokens <- struct{}{}
				linkList <- link
			}(link)
		}
	}
}
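The tokens channel in this version is meant to signal when accumulated links may be parsed and flushed. The more common shape of that idea is a buffered channel used as a counting semaphore: fill a slot before the work, free it after. A minimal, self-contained sketch (the limit of 2 and the fake work are just for illustration):
package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	tokens := make(chan struct{}, 2) // at most 2 goroutines work at once
	var wg sync.WaitGroup

	for i := 1; i <= 5; i++ {
		wg.Add(1)
		go func(n int) {
			defer wg.Done()
			tokens <- struct{}{}        // acquire a slot (blocks when 2 are taken)
			defer func() { <-tokens }() // release the slot when done
			fmt.Println("fetching page", n)
			time.Sleep(100 * time.Millisecond) // stand-in for c.Parser(...)
		}(i)
	}
	wg.Wait()
}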
utility.go
var fvalue = DepthFlag{3}

type Client struct {
	Url         *url.URL
	Fvalue      *DepthFlag
	ParsingPath string
}

func New() *Client {
	return &Client{}
}

func (c *Client) Parser(url string) []string {
	resp, err := http.Get(url)
	if err != nil {
		log.Printf("http get %s with error %v", url, err)
		return nil
	}
	// close the body on every path, including the non-200 one
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		log.Printf("http get status not OK for %s: %s", url, resp.Status)
		return nil
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		log.Printf("parse problem %s with error %v", url, err)
		return nil
	}
	c.Url = resp.Request.URL
	pre := c.findLink
	res := removeDuplicate(forEachNode(doc, pre, nil), url)
	return res
}
func removeDuplicate(elements []string, url string) (res []string) {
	m := make(map[string]bool)
	for _, v := range elements {
		// treat links with and without a trailing slash as the same
		v = strings.TrimSuffix(v, "/")
		m[v] = true
	}
	for k := range m {
		// skip the page's own url
		if url != k {
			res = append(res, k)
		}
	}
	return
}
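A quick way to see what removeDuplicate does is a small test in the same package (the example URLs are made up):
func TestRemoveDuplicate(t *testing.T) {
	links := []string{
		"https://example.com/a/",
		"https://example.com/a",
		"https://example.com",
	}
	got := removeDuplicate(links, "https://example.com")
	// the two /a variants collapse into one, and the page's own url is dropped
	if len(got) != 1 || got[0] != "https://example.com/a" {
		t.Fatalf("got %v", got)
	}
}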
type DepthFlag struct {
	Value int
}

func (d *DepthFlag) String() string {
	return strconv.Itoa(d.Value)
}

func (d *DepthFlag) Set(s string) error {
	var value int
	_, err := fmt.Sscanf(s, "%d", &value)
	if err != nil {
		return fmt.Errorf("problem with flag %s: %v", s, err)
	}
	d.Value = value
	return nil
}

func init() {
	flag.Var(&fvalue, "depth", "Depth Parameter")
}
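DepthFlag works with flag.Var because it satisfies the flag.Value interface, i.e. the String and Set methods above; once flag.Parse() runs in the program, fvalue.Value holds whatever was passed via -depth (default 3). The parsing path can also be exercised directly, without the command line:
var d DepthFlag
if err := d.Set("5"); err != nil { // same code path the flag package calls for "-depth 5"
	log.Fatal(err)
}
fmt.Println(d.Value) // prints 5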
func (c *Client) findLink(n *html.Node) (res []string) {
	// collect the href of every <a> element, resolved against the page url
	if n.Type == html.ElementNode && n.Data == "a" {
		for _, v := range n.Attr {
			if v.Key != "href" {
				continue
			}
			temp, err := c.Url.Parse(v.Val)
			if err != nil {
				continue
			}
			res = append(res, temp.String())
		}
	}
	return
}

// forEachNode walks the HTML tree and collects what pre/post return for each node.
func forEachNode(n *html.Node, pre, post func(n *html.Node) []string) (res []string) {
	if pre != nil {
		res = append(res, pre(n)...)
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		res = append(res, forEachNode(c, pre, post)...)
	}
	if post != nil {
		res = append(res, post(n)...)
	}
	return
}
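Putting utility together, a single non-concurrent call is enough to sanity-check the parser before wiring it into the crawler (same ParsingPath as in main.go):
c := utility.New()
c.ParsingPath = "https://ryanfakir.github.io"
for _, link := range c.Parser(c.ParsingPath) {
	fmt.Println(link)
}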