patterncsharpMinor
Parsing website info with a database and web service
Viewed 0 times
website info with database service parsing web and
Problem
I have to make a project for school in C# that has to use a database and a web service. I have made a program that starts a web service and gets a function from it, which is used for connecting to a website. I am using HTML Agility Pack to parse info from that website. The website info is then written to a text box and stored in the database.
Here is one function from project, and the rest is here.
Some variables are in Croatian, so don't get confused. I am looking for helpful reviews and tips on how I can make my code better.
Here is one function from project, and the rest is here.
/// <summary>
/// Loads the HTML returned by <c>client.Request()</c> as UTF-8, extracts the
/// job boxes and their headers, contents and links into the
/// <c>jobs</c>/<c>headers</c>/<c>content</c>/<c>links</c>/<c>full_links</c> lists,
/// saves the document to "file.htm" and finally calls <c>StringParser()</c>.
/// </summary>
public void HtmlParser()
{
Encoding enc = Encoding.UTF8;
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.Load(client.Request(), enc);
HtmlNode root = htmlDoc.DocumentNode;
// One XPath query per kind of data; all of them are scoped to div.jobBox.
// NOTE(review): HtmlAgilityPack's SelectNodes returns null (not an empty
// collection) when nothing matches, so the foreach loops below would throw
// on a page without job boxes — confirm and guard if needed.
HtmlNodeCollection jobNode = root.SelectNodes("//div[@class='jobBox']");
HtmlNodeCollection headerNode = root.SelectNodes("//div[@class='jobBox']/h1");
HtmlNodeCollection contentNode = root.SelectNodes("//div[@class='jobBox']/div[@class='content']");
HtmlNodeCollection linkNode = root.SelectNodes("//div[@class='jobBox']/a");
// These names are not declared in this method, so they are members declared elsewhere.
// NOTE(review): "new List()" has no type argument as shown; presumably the
// generic arguments were lost when the snippet was rendered — verify against
// the real source.
jobs = new List();
headers = new List();
content = new List();
links = new List();
full_links = new List();
// Full text of every job box (not trimmed, unlike the headers and contents).
foreach (HtmlNode node in jobNode)
{
jobs.Add(node.InnerText);
}
// Trimmed text of every job box header.
foreach (HtmlNode node in headerNode)
{
headers.Add(node.InnerText.Trim());
}
// Trimmed text of every job box content div.
foreach (HtmlNode node in contentNode)
{
content.Add(node.InnerText.Trim());
}
// For every anchor: store the raw href and a Uri built from "link" + href.
// NOTE(review): the default passed to GetAttributeValue is null, so an anchor
// without an href would make .Trim() throw — confirm.
// "link" is defined outside this method; presumably the site's base URL — confirm.
foreach (HtmlNode node in linkNode)
{
links.Add(node.GetAttributeValue("href", null).Trim());
Uri temp = new Uri(link + node.GetAttributeValue("href", null).Trim());
full_links.Add(temp);
}
// Keep a local copy of the downloaded page, then hand over to StringParser.
htmlDoc.Save("file.htm", enc);
StringParser();
}Some variables are in Croatian, so don't get confused. I am looking for helpful reviews and tips on how I can make my code better.
Solution
One comment I'd have is that there is a LOT going on in that method, but for the most part
each section follows the same steps.
You could probably pull that out into a method to shorten your code.
each section is the same...
- GetNodeCollection
- Loop Through it
- Add it to a list
You could probably pull that out into a method to shorten your code.
/// <summary>
/// Selects all nodes matching <paramref name="nodePath"/> below
/// <paramref name="root"/> and returns one trimmed string per node.
/// </summary>
/// <param name="root">Node the XPath query is evaluated against.</param>
/// <param name="nodePath">XPath expression selecting the nodes to read.</param>
/// <param name="isLink">
/// When true the value of each node's "href" attribute is returned,
/// otherwise the node's inner text.
/// </param>
/// <returns>
/// The trimmed values in document order; an empty list when nothing matches.
/// A node without an "href" attribute contributes an empty string, so the
/// result keeps one entry per matched node.
/// </returns>
public List<string> ParseSection(HtmlNode root, string nodePath, bool isLink = false)
{
List<string> parsedNodes = new List<string>();

// HtmlAgilityPack returns null rather than an empty collection when the
// XPath matches nothing, so guard before iterating.
HtmlNodeCollection nodes = root.SelectNodes(nodePath);
if (nodes == null)
{
return parsedNodes;
}

foreach (HtmlNode node in nodes)
{
// Use an empty-string default so a missing href cannot make Trim() throw.
string value = isLink
? node.GetAttributeValue("href", string.Empty)
: node.InnerText;
parsedNodes.Add(value.Trim());
}
return parsedNodes;
}
/// <summary>
/// Loads the HTML returned by <c>client.Request()</c> as UTF-8, fills the
/// <c>jobs</c>, <c>headers</c>, <c>content</c>, <c>links</c> and
/// <c>full_links</c> members from the job boxes on the page, saves the
/// document to "file.htm" and then calls <c>StringParser()</c>.
/// </summary>
public void HtmlParser()
{
Encoding enc = Encoding.UTF8;
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.Load(client.Request(), enc);
HtmlNode root = htmlDoc.DocumentNode;

// Assign the class members instead of declaring locals: the original method
// filled these members before calling StringParser(), which takes no
// arguments and so presumably reads them — locals would be thrown away.
// Note that ParseSection trims every value, including the job text.
jobs = ParseSection(root, "//div[@class='jobBox']");
headers = ParseSection(root, "//div[@class='jobBox']/h1");
content = ParseSection(root, "//div[@class='jobBox']/div[@class='content']");
links = ParseSection(root, "//div[@class='jobBox']/a", true); // true: read the href attribute

// The hrefs are appended to the "link" prefix exactly as the original loop
// did; new Uri() throws on a bare relative path. ToList() is required because
// Select returns a lazy IEnumerable<Uri>, not a List<Uri>.
full_links = links.Select(l => new Uri(link + l)).ToList();

htmlDoc.Save("file.htm", enc);
StringParser();
}
Code Snippets
/// <summary>
/// Selects all nodes matching <paramref name="nodePath"/> below
/// <paramref name="root"/> and returns one trimmed string per node.
/// </summary>
/// <param name="root">Node the XPath query is evaluated against.</param>
/// <param name="nodePath">XPath expression selecting the nodes to read.</param>
/// <param name="isLink">
/// When true the value of each node's "href" attribute is returned,
/// otherwise the node's inner text.
/// </param>
/// <returns>
/// The trimmed values in document order; an empty list when nothing matches.
/// A node without an "href" attribute contributes an empty string, so the
/// result keeps one entry per matched node.
/// </returns>
public List<string> ParseSection(HtmlNode root, string nodePath, bool isLink = false)
{
List<string> parsedNodes = new List<string>();

// HtmlAgilityPack returns null rather than an empty collection when the
// XPath matches nothing, so guard before iterating.
HtmlNodeCollection nodes = root.SelectNodes(nodePath);
if (nodes == null)
{
return parsedNodes;
}

foreach (HtmlNode node in nodes)
{
// Use an empty-string default so a missing href cannot make Trim() throw.
string value = isLink
? node.GetAttributeValue("href", string.Empty)
: node.InnerText;
parsedNodes.Add(value.Trim());
}
return parsedNodes;
}
/// <summary>
/// Loads the HTML returned by <c>client.Request()</c> as UTF-8, fills the
/// <c>jobs</c>, <c>headers</c>, <c>content</c>, <c>links</c> and
/// <c>full_links</c> members from the job boxes on the page, saves the
/// document to "file.htm" and then calls <c>StringParser()</c>.
/// </summary>
public void HtmlParser()
{
Encoding enc = Encoding.UTF8;
HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
htmlDoc.Load(client.Request(), enc);
HtmlNode root = htmlDoc.DocumentNode;

// Assign the class members instead of declaring locals: the original method
// filled these members before calling StringParser(), which takes no
// arguments and so presumably reads them — locals would be thrown away.
// Note that ParseSection trims every value, including the job text.
jobs = ParseSection(root, "//div[@class='jobBox']");
headers = ParseSection(root, "//div[@class='jobBox']/h1");
content = ParseSection(root, "//div[@class='jobBox']/div[@class='content']");
links = ParseSection(root, "//div[@class='jobBox']/a", true); // true: read the href attribute

// The hrefs are appended to the "link" prefix exactly as the original loop
// did; new Uri() throws on a bare relative path. ToList() is required because
// Select returns a lazy IEnumerable<Uri>, not a List<Uri>.
full_links = links.Select(l => new Uri(link + l)).ToList();

htmlDoc.Save("file.htm", enc);
StringParser();
}
Context
StackExchange Code Review Q#26787, answer score: 5
Revisions (0)
No revisions yet.