patterncsharpMinor

Parsing website info with a database and web service

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

websiteinfowithdatabaseserviceparsingweband

Problem

I have to make a project for school in C# that uses a database and a web service. I have written a program which starts a web service and obtains a function from it that is used for connecting to a website. I am using HTML Agility Pack to parse information from that website. The website information is then written to a textbox and stored in the database.

Here is one function from the project; the rest is here.

/// <summary>
/// Loads the HTML returned by <c>client.Request()</c>, collects the text of every
/// "jobBox" div, its h1 header, its "content" div and the href of its anchor into
/// the jobs / headers / content / links / full_links members, saves the parsed
/// document to "file.htm" and finally calls <c>StringParser()</c>.
/// </summary>
public void HtmlParser()
    {
        Encoding enc = Encoding.UTF8;

        HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();

        htmlDoc.Load(client.Request(), enc);

        HtmlNode root = htmlDoc.DocumentNode;

        // NOTE: HtmlAgilityPack's SelectNodes returns null (not an empty collection)
        // when the XPath matches nothing, so every collection is null-checked below.
        HtmlNodeCollection jobNode = root.SelectNodes("//div[@class='jobBox']");
        HtmlNodeCollection headerNode = root.SelectNodes("//div[@class='jobBox']/h1");
        HtmlNodeCollection contentNode = root.SelectNodes("//div[@class='jobBox']/div[@class='content']");
        HtmlNodeCollection linkNode = root.SelectNodes("//div[@class='jobBox']/a");

        // Element types follow from what is added below (strings and Uri instances);
        // the member declarations themselves are outside this snippet.
        jobs = new List<string>();
        headers = new List<string>();
        content = new List<string>();
        links = new List<string>();
        full_links = new List<Uri>();

        if (jobNode != null)
        {
            foreach (HtmlNode node in jobNode)
            {
                jobs.Add(node.InnerText);
            }
        }

        if (headerNode != null)
        {
            foreach (HtmlNode node in headerNode)
            {
                headers.Add(node.InnerText.Trim());
            }
        }

        if (contentNode != null)
        {
            foreach (HtmlNode node in contentNode)
            {
                content.Add(node.InnerText.Trim());
            }
        }

        if (linkNode != null)
        {
            foreach (HtmlNode node in linkNode)
            {
                // Read the attribute once; an empty default avoids a
                // NullReferenceException on anchors that have no href.
                string href = node.GetAttributeValue("href", string.Empty).Trim();
                links.Add(href);

                // 'link' is declared outside this snippet; presumably the site's base address.
                full_links.Add(new Uri(link + href));
            }
        }

        htmlDoc.Save("file.htm", enc);

        StringParser();
    }

Some variables are in Croatian, so don't get confused. I am looking for helpful reviews and tips on how I can make my code better.

Solution

One comment I'd have is that there is a LOT going on in that method... but for the most part
each section is the same...

GetNodeCollection

Loop Through it

Add it to a list

You could probably pull that out into a method to shorten your code.

/// <summary>
/// Selects every node matching <paramref name="nodePath"/> below <paramref name="root"/>
/// and returns one trimmed string per node.
/// </summary>
/// <param name="root">Node the XPath query is evaluated against.</param>
/// <param name="nodePath">XPath expression selecting the nodes to read.</param>
/// <param name="isLink">
/// When true the "href" attribute of each node is returned; otherwise its inner text.
/// </param>
/// <returns>The extracted values; an empty list when nothing matches.</returns>
public List<string> ParseSection(HtmlNode root, string nodePath, bool isLink = false)
{
    List<string> parsedNodes = new List<string>();

    // SelectNodes yields null rather than an empty collection when nothing matches.
    HtmlNodeCollection nodes = root.SelectNodes(nodePath);
    if (nodes == null)
    {
        return parsedNodes;
    }

    foreach (HtmlNode node in nodes)
    {
        // An empty default keeps Trim() from throwing on anchors without an href,
        // and still adds one entry per node.
        string value = isLink
            ? node.GetAttributeValue("href", string.Empty)
            : node.InnerText;
        parsedNodes.Add(value.Trim());
    }

    return parsedNodes;
}

/// <summary>
/// Loads the HTML returned by <c>client.Request()</c>, extracts the job boxes, their
/// headers, content and links through <c>ParseSection</c>, saves the parsed document
/// to "file.htm" and then calls <c>StringParser()</c>.
/// </summary>
public void HtmlParser()
{
    Encoding enc = Encoding.UTF8;
    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.Load(client.Request(), enc);
    HtmlNode root = htmlDoc.DocumentNode;

    // Assign to the existing members rather than new locals: the original method
    // filled these members, and StringParser() takes no arguments, so it
    // presumably reads them.
    jobs = ParseSection(root, "//div[@class='jobBox']");
    headers = ParseSection(root, "//div[@class='jobBox']/h1");
    content = ParseSection(root, "//div[@class='jobBox']/div[@class='content']");
    links = ParseSection(root, "//div[@class='jobBox']/a", true); // true: read href instead of inner text

    // Prefix each href with 'link' exactly as the original loop did, and
    // materialize the query because Select alone returns a lazy IEnumerable<Uri>.
    full_links = links.Select(href => new Uri(link + href)).ToList();

    htmlDoc.Save("file.htm", enc);
    StringParser();
}

Code Snippets

/// <summary>
/// Selects every node matching <paramref name="nodePath"/> below <paramref name="root"/>
/// and returns one trimmed string per node.
/// </summary>
/// <param name="root">Node the XPath query is evaluated against.</param>
/// <param name="nodePath">XPath expression selecting the nodes to read.</param>
/// <param name="isLink">
/// When true the "href" attribute of each node is returned; otherwise its inner text.
/// </param>
/// <returns>The extracted values; an empty list when nothing matches.</returns>
public List<string> ParseSection(HtmlNode root, string nodePath, bool isLink = false)
{
    List<string> parsedNodes = new List<string>();

    // SelectNodes yields null rather than an empty collection when nothing matches.
    HtmlNodeCollection nodes = root.SelectNodes(nodePath);
    if (nodes == null)
    {
        return parsedNodes;
    }

    foreach (HtmlNode node in nodes)
    {
        // An empty default keeps Trim() from throwing on anchors without an href,
        // and still adds one entry per node.
        string value = isLink
            ? node.GetAttributeValue("href", string.Empty)
            : node.InnerText;
        parsedNodes.Add(value.Trim());
    }

    return parsedNodes;
}

/// <summary>
/// Loads the HTML returned by <c>client.Request()</c>, extracts the job boxes, their
/// headers, content and links through <c>ParseSection</c>, saves the parsed document
/// to "file.htm" and then calls <c>StringParser()</c>.
/// </summary>
public void HtmlParser()
{
    Encoding enc = Encoding.UTF8;
    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.Load(client.Request(), enc);
    HtmlNode root = htmlDoc.DocumentNode;

    // Assign to the existing members rather than new locals: the original method
    // filled these members, and StringParser() takes no arguments, so it
    // presumably reads them.
    jobs = ParseSection(root, "//div[@class='jobBox']");
    headers = ParseSection(root, "//div[@class='jobBox']/h1");
    content = ParseSection(root, "//div[@class='jobBox']/div[@class='content']");
    links = ParseSection(root, "//div[@class='jobBox']/a", true); // true: read href instead of inner text

    // Prefix each href with 'link' exactly as the original loop did, and
    // materialize the query because Select alone returns a lazy IEnumerable<Uri>.
    full_links = links.Select(href => new Uri(link + href)).ToList();

    htmlDoc.Save("file.htm", enc);
    StringParser();
}

Context

StackExchange Code Review Q#26787, answer score: 5

Revisions (0)

No revisions yet.