Practice Go: Web Crawler
Dec 24, 2016
4 minutes read

Task: build a web crawler with Go

Here is the link to my question where I asked the community for help. Luckily, the Go community has tons of terrific people willing to help.

As we can see, making goroutines quit safely is tricky, and even the final version is still error-prone.
In the next section, we will take a look at the context package, which makes it easier to write such code without much thinking.

main.go — final version: fewer goroutines spawned, less time consumed

// main crawls links starting from ParsingPath, fanning work out to two
// worker goroutines over unbuffered channels. Depth is capped at 3 and
// already-seen links are filtered in the main loop.
func main() {
	start := time.Now()
	c := utility.New()
	c.ParsingPath = "https://ryanfakir.github.io"
	// linkList carries links accepted for crawling; out carries newly
	// discovered links back to the main loop for dedup/depth filtering.
	linkList := make(chan Links)
	out := make(chan Links)
	depth := 3
	wg := &sync.WaitGroup{}
	// One WaitGroup unit per link that is registered but not yet resolved;
	// the seed link below is the first unit.
	wg.Add(1)
	go func() {
		// Seed the pipeline with the root link at level 1.
		linkList <- Links{1, c.ParsingPath}
	}()
	for i := 0; i < 2; i++ {
		// Two workers fetch and parse accepted links concurrently.
		go func() {
			for v := range linkList {
				fmt.Printf(" runtime.NumGoroutine()= %+v\n", runtime.NumGoroutine())
				if v.Level >= depth {
					// resolve work 
					// At max depth there is nothing to expand; just retire
					// this link's WaitGroup unit.
					wg.Done()
					continue
				}
				res := c.Parser(v.Link)
				go func(v Links, res []string) {
					// resolve work for previous-round link
					// NOTE(review): this wg.Done can race with the wg.Add(1)
					// in the main loop below — if the counter momentarily
					// reaches zero, the closer goroutine closes linkList
					// while main still intends to send on it (panic: send on
					// closed channel). TODO: confirm and restructure so Add
					// happens before the parent's Done.
					defer wg.Done()
					for _, item := range res {
						out <- Links{v.Level + 1, item}
					}
				}(v, res)
			}
		}()
	}

	go func() {
		// wait all work done and quit job
		// When every registered link has been resolved, closing both
		// channels lets the workers and the main range loop exit.
		wg.Wait()
		close(linkList)
		close(out)
	}()
	// m deduplicates links that have already been registered for crawling.
	m := make(map[string]bool)
	for link := range out {
		if !m[link.Link] && link.Level <= depth {
			fmt.Printf("time.Since(start) = %+v\n", time.Since(start))
			// How many link been processed
			// Next-around links need to be registered in wg
			wg.Add(1)
			m[link.Link] = true
			linkList <- link
		}
	}
	fmt.Println("done")
}

A second approach — unluckily, I have not figured out the right way to make it quit.

// main is a second crawl attempt that batches "untouched" links behind a
// token channel. NOTE(review): as the text above says, this version never
// terminates; it also will not compile as shown, because `tokens` is not
// declared in this snippet (presumably a package-level chan struct{}).
func main() {
	start := time.Now()
	c := utility.New()
	c.ParsingPath = "https://ryanfakir.github.io"
	linkList := make(chan Links)
	out := make(chan Links)
	depth := 3
	go func() {
		// start first link
		linkList <- Links{1, c.ParsingPath}
		tokens <- struct{}{}
	}()
	// res accumulates links not yet crawled.
	// NOTE(review): res is read and appended by both worker goroutines
	// with no synchronization — a data race (run with -race to see it).
	var res []Links
	for i := 0; i < 2; i++ {
		go func() {
			for v := range linkList {
				fmt.Printf(" runtime.NumGoroutine()= %+v\n", runtime.NumGoroutine())
				if v.Level >= depth {
					// Max depth reached: drop the link.
					continue
				}
				select {
				case <-tokens:
					// append untouch urls from res
					// Token available: parse the whole backlog plus the
					// current link, then emit everything on out.
					var items []Links
					for _, unTouchedLinks := range res {
						for _, url := range c.Parser(unTouchedLinks.Link) {
							items = append(items, Links{unTouchedLinks.Level + 1, url})
						}
					}
					for _, url := range c.Parser(v.Link) {
						items = append(items, Links{v.Level + 1, url})
					}
					res = append(res, items...)
					// go func(v Links, res []string) {
					for _, item := range res {
						out <- item
					}
				default:
					// No token: push this link onto the backlog instead.
					res = append(res, v)
				}
				//}(v, res)
			}
		}()
	}

	// m deduplicates links; i counts how many links were accepted.
	m := make(map[string]bool)
	var i int
	for link := range out {
		if !m[link.Link] && link.Level <= depth {
			i++
			fmt.Printf("time.Since(start) = %+v\n", time.Since(start))
			fmt.Printf("i = %+v\n", i)
			m[link.Link] = true
			go func() {
				// NOTE(review): before Go 1.22 this closure captures the
				// loop variable `link`, which may have advanced by the time
				// the goroutine runs — shadow with `link := link` to be safe.
				tokens <- struct{}{}
				linkList <- link
			}()

		}
	}
}

utility.go

// fvalue is the package-level depth flag registered in init; the crawl
// depth defaults to 3.
var fvalue = DepthFlag{3}

// Client crawls pages rooted at ParsingPath. Url holds the base URL of
// the most recently fetched page, used to resolve relative hrefs.
// NOTE(review): Go naming convention would spell the field URL, not Url.
type Client struct {
	Url         *url.URL
	Fvalue      *DepthFlag
	ParsingPath string
}

// New returns a zero-valued Client; callers assign ParsingPath before use.
func New() *Client {
	return new(Client)
}
// Parser fetches url, parses the response as HTML, and returns the
// de-duplicated list of absolute links found on the page (excluding url
// itself). On any fetch/parse failure it prints the problem and returns nil.
func (c *Client) Parser(url string) []string {
	resp, err := http.Get(url)
	if err != nil {
		// fmt.Errorf only constructs an error; print the message so the
		// failure is actually visible.
		fmt.Printf("http get %s with error %v\n", url, err)
		return nil
	}
	// Close the body on every path below — including the non-200 return,
	// which previously leaked the connection.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		// err is always nil here; report the HTTP status instead.
		fmt.Printf("http get status not OK: %s with status %s\n", url, resp.Status)
		return nil
	}
	doc, err := html.Parse(resp.Body)
	if err != nil {
		fmt.Printf("parse problem %s with error %v\n", url, err)
		return nil
	}
	// Remember the final request URL (after redirects) as the base for
	// resolving relative hrefs in findLink.
	c.Url = resp.Request.URL
	pre := c.findLink
	res := removeDuplicate(forEachNode(doc, pre, nil), url)
	return res
}

// removeDuplicate collapses elements to a unique set, trimming a single
// trailing "/" from each entry, and drops url itself from the result.
// The returned order is unspecified (map iteration order); nil input
// yields nil.
func removeDuplicate(elements []string, url string) (res []string) {
	seen := make(map[string]bool, len(elements))
	for _, v := range elements {
		// Treat "http://a/b" and "http://a/b/" as the same link.
		seen[strings.TrimSuffix(v, "/")] = true
	}
	for k := range seen {
		if k != url {
			res = append(res, k)
		}
	}
	return
}

// DepthFlag is a flag.Value holding the maximum crawl depth.
type DepthFlag struct {
	Value int
}

// String returns the depth as decimal text (flag.Value interface).
func (d *DepthFlag) String() string {
	return strconv.Itoa(d.Value)
}

// Set parses s as a decimal depth (flag.Value interface). On a malformed
// value it leaves d unchanged and returns a descriptive error.
func (d *DepthFlag) Set(s string) error {
	var value int
	if _, err := fmt.Sscanf(s, "%d", &value); err != nil {
		// The original built this error and discarded it; return it so the
		// flag package can report the bad value to the user.
		return fmt.Errorf("problem with flag %s: %w", s, err)
	}
	d.Value = value
	return nil
}
// init registers fvalue so the crawl depth can be overridden with the
// -depth command-line flag.
func init() {
	flag.Var(&fvalue, "depth", "Depth Parameter")
}

// findLink extracts the href targets of an anchor (<a>) element, resolved
// against the page's base URL c.Url. Non-anchor nodes yield nothing.
func (c *Client) findLink(n *html.Node) (res []string) {
	if n.Type != html.ElementNode || n.Data != "a" {
		return nil
	}
	for _, attr := range n.Attr {
		if attr.Key != "href" {
			continue
		}
		resolved, err := c.Url.Parse(attr.Val)
		if err != nil {
			// Skip hrefs that do not parse as URLs.
			continue
		}
		res = append(res, resolved.String())
	}
	return res
}
// forEachNode walks the tree rooted at n depth-first, calling pre before
// visiting a node's children and post afterwards, and concatenates every
// string either callback returns. A nil callback is simply skipped.
func forEachNode(n *html.Node, pre, post func(n *html.Node) []string) []string {
	var collected []string
	if pre != nil {
		collected = append(collected, pre(n)...)
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		collected = append(collected, forEachNode(child, pre, post)...)
	}
	if post != nil {
		collected = append(collected, post(n)...)
	}
	return collected
}

Back to posts