patterngoMinor
Go Go Gadget Web Crawler
Viewed 0 times
crawlerwebgadget
Problem
In A Tour of Go, you are given the following problem:
In this exercise you'll use Go's concurrency features to parallelize a
web crawler.
Modify the
the same URL twice.
This is the code you're given
```
package main
import (
"fmt"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
Crawl(u, depth-1, fetcher)
}
return
}
func main() {
Crawl("http://golang.org/", 4, fetcher)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"http://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"http://golang.org/pkg/",
"http://golang.org/cmd/",
},
},
"http://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"http://golang.org/",
"http://golang.org/cmd/",
"http://golang.org/pkg/fmt/",
"http://golang.org/pkg/os/",
},
},
"http://golang.org/pkg/fmt/
In this exercise you'll use Go's concurrency features to parallelize a
web crawler.
Modify the
Crawl function to fetch URLs in parallel without fetchingthe same URL twice.
This is the code you're given
```
package main
import (
"fmt"
)
type Fetcher interface {
// Fetch returns the body of URL and
// a slice of URLs found on that page.
Fetch(url string) (body string, urls []string, err error)
}
// Crawl uses fetcher to recursively crawl
// pages starting with url, to a maximum of depth.
func Crawl(url string, depth int, fetcher Fetcher) {
// TODO: Fetch URLs in parallel.
// TODO: Don't fetch the same URL twice.
// This implementation doesn't do either:
if depth <= 0 {
return
}
body, urls, err := fetcher.Fetch(url)
if err != nil {
fmt.Println(err)
return
}
fmt.Printf("found: %s %q\n", url, body)
for _, u := range urls {
Crawl(u, depth-1, fetcher)
}
return
}
func main() {
Crawl("http://golang.org/", 4, fetcher)
}
// fakeFetcher is Fetcher that returns canned results.
type fakeFetcher map[string]*fakeResult
type fakeResult struct {
body string
urls []string
}
func (f fakeFetcher) Fetch(url string) (string, []string, error) {
if res, ok := f[url]; ok {
return res.body, res.urls, nil
}
return "", nil, fmt.Errorf("not found: %s", url)
}
// fetcher is a populated fakeFetcher.
var fetcher = fakeFetcher{
"http://golang.org/": &fakeResult{
"The Go Programming Language",
[]string{
"http://golang.org/pkg/",
"http://golang.org/cmd/",
},
},
"http://golang.org/pkg/": &fakeResult{
"Packages",
[]string{
"http://golang.org/",
"http://golang.org/cmd/",
"http://golang.org/pkg/fmt/",
"http://golang.org/pkg/os/",
},
},
"http://golang.org/pkg/fmt/
Solution
-
Your map is of booleans so you can replace
-
It would be better to have
immediately once only rather than sprinkle
-
You seem to be using
You could just have a single wait group that keeps track of outstanding jobs. Your
-
Although having dedup check the depth is probably still a good idea, it's easy and trivially to do it in the crawl function and save some effort.
You code modified with the above points:
And runnable on the playground (by including the Go tour stuff):
http://play.golang.org/p/cuY3PZdQmI
(For comparison: your original code is also runnable on the playground.)
Your map is of booleans so you can replace
if _, ok := seen[job.url]; ok || bar with if seen[job.url] || bar. The zero value (which is what gets returned for non-existing map lookups) of bool is false.-
It would be better to have
Crawl do something likedefer func() {
job.done <- true
}()immediately once only rather than sprinkle
job.done
-
Your for loop in Crawl should either just require the channel is sufficiently buffered (as it currently is) and not bother with goroutines, or the whole for loop should be put into a single goroutine (and then you don't need any channel buffering).
-
for i := 0; i -
You seem to be using
Jobs.done like sync.WaitGroup.You could just have a single wait group that keeps track of outstanding jobs. Your
main would make sure to call wg.Add(1) before sending it's first job and before it then does wg.Wait(). Mostly everywhere you do `job.done -
Although having dedup check the depth is probably still a good idea, it's easy and trivially to do it in the crawl function and save some effort.
You code modified with the above points:
type Job struct {
url string
depth int
}
func dedup(fetcher Fetcher, ch chan Job, wg *sync.WaitGroup) {
seen := make(map[string]bool)
for job := range ch {
if seen[job.url] || job.depth <= 0 {
wg.Done()
continue
}
seen[job.url] = true
go Crawl(job, fetcher, ch, wg)
}
}
func Crawl(job Job, fetcher Fetcher, q chan Job, wg *sync.WaitGroup) {
defer wg.Done()
body, urls, err := fetcher.Fetch(job.url)
if err != nil {
log.Println("Crawl failure:", err) // TODO: something better with errors
return
}
log.Printf("Crawl: found %s\t%q\n", job.url, body)
if job.depth <= 1 {
return
}
wg.Add(len(urls))
for _, u := range urls {
q <- Job{u, job.depth - 1}
}
}
func main() {
wg := new(sync.WaitGroup)
wg.Add(1)
q := make(chan Job)
go dedup(fetcher, q, wg)
q <- Job{"http://golang.org/", 4}
wg.Wait()
// We close q here so that the dedup goroutine
// will finish. This isn't strictly needed here
// since we're main and everything gets stopped
// when we return; but that wouldn't be the case
// if we were some other function in a long lived
// program.
close(q)
}And runnable on the playground (by including the Go tour stuff):
http://play.golang.org/p/cuY3PZdQmI
(For comparison: your original code is also runnable on the playground.)
Code Snippets
defer func() {
job.done <- true
}()type Job struct {
url string
depth int
}
func dedup(fetcher Fetcher, ch chan Job, wg *sync.WaitGroup) {
seen := make(map[string]bool)
for job := range ch {
if seen[job.url] || job.depth <= 0 {
wg.Done()
continue
}
seen[job.url] = true
go Crawl(job, fetcher, ch, wg)
}
}
func Crawl(job Job, fetcher Fetcher, q chan Job, wg *sync.WaitGroup) {
defer wg.Done()
body, urls, err := fetcher.Fetch(job.url)
if err != nil {
log.Println("Crawl failure:", err) // TODO: something better with errors
return
}
log.Printf("Crawl: found %s\t%q\n", job.url, body)
if job.depth <= 1 {
return
}
wg.Add(len(urls))
for _, u := range urls {
q <- Job{u, job.depth - 1}
}
}
func main() {
wg := new(sync.WaitGroup)
wg.Add(1)
q := make(chan Job)
go dedup(fetcher, q, wg)
q <- Job{"http://golang.org/", 4}
wg.Wait()
// We close q here so that the dedup goroutine
// will finish. This isn't strictly needed here
// since we're main and everything gets stopped
// when we return; but that wouldn't be the case
// if we were some other function in a long lived
// program.
close(q)
}Context
StackExchange Code Review Q#70263, answer score: 5
Revisions (0)
No revisions yet.