patterncsharpMinor
Page Scraper and DOM manipulator
Viewed 0 times
manipulatordompageandscraper
Problem
This code is a page scraper using HtmlAgilityPack that creates a DOM document upon construction and allows for node manipulation afterward.
HtmlAgilityPack uses XPath Selectors for selecting nodes.
An example use case would look like:
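This is the PageScraper class, the only class in the Scraper namespace so far.
The reason I pass in an Action is so that I can attach log messages to a Forms list box.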
```
using System;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace Scraper
{
public class PageScraper
{
private readonly HtmlDocument _document;
public Action Print { get; set; }
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses the returned HTML into the document
/// that the Fetch* methods of this class query.
/// </summary>
/// <param name="url">Address of the page to download; must not be null or empty.</param>
/// <param name="print">Logging callback; stored in <see cref="Print"/> and invoked with progress messages.</param>
/// <exception cref="Exception">Thrown when <paramref name="url"/> is null or empty.</exception>
public PageScraper(string url, Action print)
{
if (string.IsNullOrEmpty(url))
{
throw new Exception("url is empty");
}
Print = print;
// NOTE(review): WebClient implements IDisposable and is never disposed in this constructor.
WebClient webClient = new WebClient();
// NOTE(review): Print is declared as Action but is called with a string argument; presumably the
// type was Action<string> and the generic argument was lost in extraction - confirm.
Print("downloading page");
// StripNewLineCharactersAndSpaces is defined outside the visible part of the class.
string urlWithoutNewLineCharacters = StripNewLineCharactersAndSpaces(url);
string html = webClient.DownloadString(urlWithoutNewLineCharacters);
_document = new HtmlDocument();
_document.LoadHtml(html);
// This message is emitted after the HTML has already been loaded into the document.
Print("loading page");
}
/// <summary>
/// Logs "fetching links" and selects every anchor element that carries an href attribute.
/// </summary>
/// <returns>The result of the XPath query <c>//a[@href]</c> run against the document root.</returns>
public HtmlNodeCollection FetchLinks()
{
Print("fetching links");
// NOTE(review): HtmlAgilityPack's SelectNodes reportedly returns null (not an empty collection)
// when nothing matches - confirm, since callers would need a null check.
return _document.DocumentNode.SelectNodes("//a[@href]");
}
/// <summary>
/// Logs "fetching nodes" and runs the given XPath selector against the document root.
/// </summary>
/// <param name="selector">XPath expression passed unchanged to SelectNodes.</param>
/// <returns>The collection returned by SelectNodes for <paramref name="selector"/>.</returns>
public HtmlNodeCollection FetchNodes(string selector)
{
Print("fetching nodes");
// NOTE(review): HtmlAgilityPack's SelectNodes reportedly returns null (not an empty collection)
// when nothing matches - confirm, since callers would need a null check.
return _document.DocumentNode.SelectNodes(selector);
}
public HtmlNode FetchNode(string selector)
{
Print("fetching node");
return _document.DocumentNode.SelectSingle
HtmlAgilityPack uses XPath Selectors for selecting nodes.
An example use case would look like:
var me = new PageScraper("http://codereview.stackexchange.com/users/62429/quill", Console.WriteLine);
var usernameNode = me.FetchNode("//h2[@class=user-card-name]");
var username = PageScraper.FetchNodeText(usernameNode);
This is the PageScraper class, the only class in the Scraper namespace so far.
The reason I pass in an Action is so that I can attach log messages to a Forms list box.
```
using System;
using System.Net;
using System.Text.RegularExpressions;
using HtmlAgilityPack;
namespace Scraper
{
public class PageScraper
{
private readonly HtmlDocument _document;
public Action Print { get; set; }
/// <summary>
/// Downloads the page at <paramref name="url"/> and parses the returned HTML into the document
/// that the Fetch* methods of this class query.
/// </summary>
/// <param name="url">Address of the page to download; must not be null or empty.</param>
/// <param name="print">Logging callback; stored in <see cref="Print"/> and invoked with progress messages.</param>
/// <exception cref="Exception">Thrown when <paramref name="url"/> is null or empty.</exception>
public PageScraper(string url, Action print)
{
if (string.IsNullOrEmpty(url))
{
throw new Exception("url is empty");
}
Print = print;
// NOTE(review): WebClient implements IDisposable and is never disposed in this constructor.
WebClient webClient = new WebClient();
// NOTE(review): Print is declared as Action but is called with a string argument; presumably the
// type was Action<string> and the generic argument was lost in extraction - confirm.
Print("downloading page");
// StripNewLineCharactersAndSpaces is defined outside the visible part of the class.
string urlWithoutNewLineCharacters = StripNewLineCharactersAndSpaces(url);
string html = webClient.DownloadString(urlWithoutNewLineCharacters);
_document = new HtmlDocument();
_document.LoadHtml(html);
// This message is emitted after the HTML has already been loaded into the document.
Print("loading page");
}
/// <summary>
/// Logs "fetching links" and selects every anchor element that carries an href attribute.
/// </summary>
/// <returns>The result of the XPath query <c>//a[@href]</c> run against the document root.</returns>
public HtmlNodeCollection FetchLinks()
{
Print("fetching links");
// NOTE(review): HtmlAgilityPack's SelectNodes reportedly returns null (not an empty collection)
// when nothing matches - confirm, since callers would need a null check.
return _document.DocumentNode.SelectNodes("//a[@href]");
}
/// <summary>
/// Logs "fetching nodes" and runs the given XPath selector against the document root.
/// </summary>
/// <param name="selector">XPath expression passed unchanged to SelectNodes.</param>
/// <returns>The collection returned by SelectNodes for <paramref name="selector"/>.</returns>
public HtmlNodeCollection FetchNodes(string selector)
{
Print("fetching nodes");
// NOTE(review): HtmlAgilityPack's SelectNodes reportedly returns null (not an empty collection)
// when nothing matches - confirm, since callers would need a null check.
return _document.DocumentNode.SelectNodes(selector);
}
public HtmlNode FetchNode(string selector)
{
Print("fetching node");
return _document.DocumentNode.SelectSingle
Solution
Print = print;
WebClient webClient = new WebClient();
Print("downloading page");
string urlWithoutNewLineCharacters = StripNewLineCharactersAndSpaces(url);
string html = webClient.DownloadString(urlWithoutNewLineCharacters);
_document = new HtmlDocument();
_document.LoadHtml(html);
Print("loading page");
You are doing too much in your constructor. Consider extracting some of this code into separate method(s), maybe an
initialize() method or a buildDocument() method. The rule for this is known as Coding at the Wrong Level of Abstraction. Your method
StripNewLineCharactersAndSpaces should be defined right after the method where it is first used, not at the end of your class. This is known as Vertical Separation. It allows readers to quickly read your code without having to bounce around too much. I do like that you have kept your methods really small.
Code Snippets
Print = print;
WebClient webClient = new WebClient();
Print("downloading page");
string urlWithoutNewLineCharacters = StripNewLineCharactersAndSpaces(url);
string html = webClient.DownloadString(urlWithoutNewLineCharacters);
_document = new HtmlDocument();
_document.LoadHtml(html);
Print("loading page");
Context
StackExchange Code Review Q#117112, answer score: 4
Revisions (0)
No revisions yet.