HiveBrain v1.2.0
Get Started
← Back to all entries
patterncsharpMinor

My recursive parser is wet behind the ears (well, it's not DRY at least)

Submitted by: @import:stackexchange-codereview··
0
Viewed 0 times
theparserwetearsrecursivewelldrybehindleastnot

Problem

I have an issue opened on GitHub since mid-November, to refactor & DRY up the Rubberduck Parser module (the idea is to model the code in a VBA project or code module, so it can be browsed and inspected programmatically).

Yes, I've written this ........crap. I'm guilty of Ctrl+C/Ctrl+V coding here, of which I'm very unproud, but I'm not sure if it's because I messed up or because it's just the way it has to be (please, no!).

[ComVisible(false)]
public class Parser
{
    private readonly IEnumerable _grammar;

    public Parser(IEnumerable grammar)
    {
        _grammar = grammar;
    }

    public SyntaxTreeNode Parse(VBProject project)
    {
        var nodes = new List();
        try
        {
            var components = project.VBComponents.Cast().ToList();
            foreach (var component in components)
            {
                var lineCount = component.CodeModule.CountOfLines;
                if (lineCount 
    /// Converts VBA code into a .
    /// 
    /// The name of the VBA Project, used for scoping public nodes.
    /// The name of the module, used for scoping private nodes.
    /// The code to parse.
    /// 
    public SyntaxTreeNode Parse(string projectName, string componentName, string code, bool isClassModule)
    {
        var content = SplitLogicalCodeLines(projectName, componentName, code);
        var memberNodes = ParseModuleMembers(projectName, componentName, content).ToList();

        var result = new ModuleNode(projectName, componentName, memberNodes, isClassModule);
        return result;
    }


A bit of VBA theory here. In VBA, a code line can be "continued" using a "line continuation character", like this:

Dim foo As Integer, _
bar As String, _
baz As SomethingElse


The code module will give me 3 lines for the above, but it's really 1 "logical" code line, with one instruction that declares 3 variables. The SplitLogicalCodeLines method scans the VBA source code and returns a LogicalCodeLine for each, well

Solution

Extract methods, extract methods everywhere, PLEASE!
It's embarassing, after extracting the body of this code:

foreach (var instruction in instructions)
{
    var parsed = false;
    foreach (var syntax in grammar)
    {
        SyntaxTreeNode node;
        if (!syntax.IsMatch(publicScope, localScope, instruction, out node))
        {
            continue;
        }

        if (node.HasChildNodes)
        {
            var childNode = node as CodeBlockNode;
            if (childNode != null)
            {
                node = ParseCodeBlock(publicScope, localScope, childNode, logicalCodeLines, ref currentIndex);
            }
        }

        result.AddNode(node);
        parsed = true;
        break;
    }

    if (!parsed)
    {
        result.AddNode(new ExpressionNode(instruction, localScope));
    }
}


which appears in the exactly same form in ParseProcedure and in ParseCodeBlock, I could, of course re-use it and make the code more readable and mantainable:

private bool ParseCodeBlockInstruction(string publicScope, string localScope, IEnumerable logicalLines,
            Instruction instruction, CodeBlockNode codeBlockNode, IEnumerable grammar, ref int index)
{
    var result = codeBlockNode;
    foreach (var syntax in grammar)
    {
        SyntaxTreeNode node;
        if (!syntax.IsMatch(publicScope, localScope, instruction, out node))
        {
            continue;
        }

        var childNode = node as CodeBlockNode;
        if (childNode != null)
        {
            node = ParseCodeBlock(publicScope, localScope, childNode, logicalLines, ref index);
        }

        result.AddNode(node);
        return true;
    }
    return false;
}


After this you should also extract the correspondent bit of code in ParseModuleMembers. This time the reason is because the logic is a bit hard to follow up, and that parsed flag that appeared everywhere in those three methods doesn't really help:

private class ParsedSyntaxTreeNode
{
    public SyntaxTreeNode Node { get; set; }
    public bool Parsed { get; set; }
    public int Index { get; set; }
}

private IEnumerable ParseModuleInstruction(string publicScope, string localScope, IEnumerable lines,
    Instruction instruction, int index)
{
    foreach (var syntax in _grammar.Where(s => !s.IsChildNodeSyntax))
    {
        string currentLocalScope = localScope;
        SyntaxTreeNode node;
        if (!syntax.IsMatch(publicScope, currentLocalScope, instruction, out node))
        {
            continue;
        }

        if (syntax.Type.HasFlag(SyntaxType.HasChildNodes))
        {
            var codeBlockNode = node as CodeBlockNode;
            if (codeBlockNode != null)
            {
                if (node is ProcedureNode)
                {
                    currentLocalScope = localScope + "." + (node as ProcedureNode).Identifier.Name;
                    var parsedProcNode = ParseProcedure(publicScope, currentLocalScope, node as ProcedureNode, lines, ref index);
                    yield return new ParsedSyntaxTreeNode()
                    {
                        Node = parsedProcNode,
                        Parsed = true,
                        Index = index
                    };
                    yield break;
                }

                var parsedCodeNode = ParseCodeBlock(publicScope, currentLocalScope, codeBlockNode, lines, ref index);
                yield return new ParsedSyntaxTreeNode()
                {
                    Node = parsedCodeNode,
                    Parsed = true,
                    Index = index
                };
                yield break;
            }
        }

        yield return new ParsedSyntaxTreeNode()
        {
            Node = node,
            Parsed = true,
            Index = index
        };
    }
    yield return new ParsedSyntaxTreeNode(){Index = index, Parsed = false};
}


I would also swap your while in the ParseCodeBlock by a for and get rid of of the increments that are made in the existing while body, and give more meaning to what we are doing: processing all lines.

for (++index; index  lines[indexCaptured].Content.Trim().StartsWith(marker)))
    {
        break;
    }
    var line = lines[index];
    if (string.IsNullOrWhiteSpace(line.Content))
    {
        continue;
    }

    var instructions = line.SplitInstructions();
    foreach (var instruction in instructions)
    {
        bool hadMatch = ParseCodeBlockInstruction(publicScope, localScope, logicalCodeLines, instruction, codeBlockNode, grammar, ref index);
        if (!hadMatch)
        {
            result.AddNode(new ExpressionNode(instruction, localScope));
        }
    }

    if (lines[index + 1].Content.Trim().StartsWith(ReservedKeywords.Else))
    {
        break;
    }
}
index = Math.Max(index, lines.Length);


Refactored code on pastebin.

Code Snippets

foreach (var instruction in instructions)
{
    var parsed = false;
    foreach (var syntax in grammar)
    {
        SyntaxTreeNode node;
        if (!syntax.IsMatch(publicScope, localScope, instruction, out node))
        {
            continue;
        }

        if (node.HasChildNodes)
        {
            var childNode = node as CodeBlockNode;
            if (childNode != null)
            {
                node = ParseCodeBlock(publicScope, localScope, childNode, logicalCodeLines, ref currentIndex);
            }
        }

        result.AddNode(node);
        parsed = true;
        break;
    }

    if (!parsed)
    {
        result.AddNode(new ExpressionNode(instruction, localScope));
    }
}
private bool ParseCodeBlockInstruction(string publicScope, string localScope, IEnumerable<LogicalCodeLine> logicalLines,
            Instruction instruction, CodeBlockNode codeBlockNode, IEnumerable<ISyntax> grammar, ref int index)
{
    var result = codeBlockNode;
    foreach (var syntax in grammar)
    {
        SyntaxTreeNode node;
        if (!syntax.IsMatch(publicScope, localScope, instruction, out node))
        {
            continue;
        }

        var childNode = node as CodeBlockNode;
        if (childNode != null)
        {
            node = ParseCodeBlock(publicScope, localScope, childNode, logicalLines, ref index);
        }

        result.AddNode(node);
        return true;
    }
    return false;
}
private class ParsedSyntaxTreeNode
{
    public SyntaxTreeNode Node { get; set; }
    public bool Parsed { get; set; }
    public int Index { get; set; }
}

private IEnumerable<ParsedSyntaxTreeNode> ParseModuleInstruction(string publicScope, string localScope, IEnumerable<LogicalCodeLine> lines,
    Instruction instruction, int index)
{
    foreach (var syntax in _grammar.Where(s => !s.IsChildNodeSyntax))
    {
        string currentLocalScope = localScope;
        SyntaxTreeNode node;
        if (!syntax.IsMatch(publicScope, currentLocalScope, instruction, out node))
        {
            continue;
        }

        if (syntax.Type.HasFlag(SyntaxType.HasChildNodes))
        {
            var codeBlockNode = node as CodeBlockNode;
            if (codeBlockNode != null)
            {
                if (node is ProcedureNode)
                {
                    currentLocalScope = localScope + "." + (node as ProcedureNode).Identifier.Name;
                    var parsedProcNode = ParseProcedure(publicScope, currentLocalScope, node as ProcedureNode, lines, ref index);
                    yield return new ParsedSyntaxTreeNode()
                    {
                        Node = parsedProcNode,
                        Parsed = true,
                        Index = index
                    };
                    yield break;
                }

                var parsedCodeNode = ParseCodeBlock(publicScope, currentLocalScope, codeBlockNode, lines, ref index);
                yield return new ParsedSyntaxTreeNode()
                {
                    Node = parsedCodeNode,
                    Parsed = true,
                    Index = index
                };
                yield break;
            }
        }

        yield return new ParsedSyntaxTreeNode()
        {
            Node = node,
            Parsed = true,
            Index = index
        };
    }
    yield return new ParsedSyntaxTreeNode(){Index = index, Parsed = false};
}
for (++index; index < lines.Length; ++index)
{
    int indexCaptured = index;
    if (result.EndOfBlockMarkers.Any(marker => lines[indexCaptured].Content.Trim().StartsWith(marker)))
    {
        break;
    }
    var line = lines[index];
    if (string.IsNullOrWhiteSpace(line.Content))
    {
        continue;
    }

    var instructions = line.SplitInstructions();
    foreach (var instruction in instructions)
    {
        bool hadMatch = ParseCodeBlockInstruction(publicScope, localScope, logicalCodeLines, instruction, codeBlockNode, grammar, ref index);
        if (!hadMatch)
        {
            result.AddNode(new ExpressionNode(instruction, localScope));
        }
    }

    if (lines[index + 1].Content.Trim().StartsWith(ReservedKeywords.Else))
    {
        break;
    }
}
index = Math.Max(index, lines.Length);

Context

StackExchange Code Review Q#74553, answer score: 6

Revisions (0)

No revisions yet.