patternpythonMinor
Quoted string parser
Viewed 0 times
parserquotedstring
Problem
I have written a string parser that is designed to split a string by spaces, excluding spaces wrapped in strings.
Here are some example inputs and outputs:
And the rules my parser follows:
-
The input string is split by spaces, counting any text between quotes (either single or double) as a single "unit" of text (i.e. not split by spaces).
-
Any text wrapped in double quotes can be escaped using a backslash. For example:
-
Any trailing backslashes are stripped.
Here is the code I have written to parse the strings:
`import re as _re
import enum as _enum
def is_space(text):
    """Return whether ``text`` consists solely of whitespace.

    The check is a regular-expression match over the whole string, so the
    empty string and runs of several whitespace characters are accepted as
    well as a single space.

    Args:
        text: The string to test.

    Returns:
        True when every character of ``text`` is whitespace (or ``text`` is
        empty), False otherwise.
    """
    return _re.match(r'^\s*$', text) is not None
class State(_enum.Enum):
    """A state for the parser, named after the last character consumed.

    space: The last character was a space.
    word: The last character was part of a word.
    quote: The last character was part of a quoted string.
    """

    space = 0
    word = 1
    quote = 2
class ParserState(object):
"""A string parser that splits a string into words.
This parser splits a string into words, counting quoted strings as a single
word.
Example:
input: 'hello world "inner string"'
output: ['hello', 'world', '"inner string"']
"""
def __init__(self, text):
"""Constructs a
Here are some example inputs and outputs:
| Rule | Input | Output |
|------|-------------------------------|-----------------------------------|
| 1 | 'foo' | ['foo'] |
| 1 | 'foo bar' | ['foo', 'bar'] |
| 1 | 'foo bar \'abc xyz\'' | ['foo', 'bar', "'abc xyz'"] |
| 1 | 'foo bar "abc xyz"' | ['foo', 'bar', '"abc xyz"'] |
| 3 | 'foo bar "abc xyz"\\' | ['foo', 'bar', '"abc xyz"'] |
| 2 | 'foo bar "abc \\"def\\" xyz"' | ['foo', 'bar', '"abc "def" xyz"'] |
And the rules my parser follows:
-
The input string is split by spaces, counting any text between quotes (either single or double) as a single "unit" of text (i.e. not split by spaces).
-
Any text wrapped in double quotes can be escaped using a backslash. For example:
'hello "\\"world\\""' becomes ['hello', '""world""'].
-
Any trailing backslashes are stripped.
Here is the code I have written to parse the strings:
`import re as _re
import enum as _enum
def is_space(text):
    """Return whether ``text`` consists solely of whitespace.

    The check is a regular-expression match over the whole string, so the
    empty string and runs of several whitespace characters are accepted as
    well as a single space.

    Args:
        text: The string to test.

    Returns:
        True when every character of ``text`` is whitespace (or ``text`` is
        empty), False otherwise.
    """
    return _re.match(r'^\s*$', text) is not None
class State(_enum.Enum):
    """A state for the parser, named after the last character consumed.

    space: The last character was a space.
    word: The last character was part of a word.
    quote: The last character was part of a quoted string.
    """

    space = 0
    word = 1
    quote = 2
class ParserState(object):
"""A string parser that splits a string into words.
This parser splits a string into words, counting quoted strings as a single
word.
Example:
input: 'hello world "inner string"'
output: ['hello', 'world', '"inner string"']
"""
def __init__(self, text):
"""Constructs a
Solution
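What benefit is your `ParserState` class?
The only property that I think is OK is the `character` one; all the others are just noise and can go.
You also want to reset the `word` whenever you use `push_word`, and when you `append` or `push_character` you just want to push the character.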
And so if you had to keep it I'd use the following:
However this has no advantage over merging it with parse and keeping it a single function.
In fact it hinders the readability.
And so I'd merge it together to get:
For your enum I'd use a method that actually works in all 2.7 versions, rather than one that only works in the latest.
I used one of the methods in the top answer, but it shouldn't matter too much.
Just keep in mind that I use uppercase variables, as they are constants.
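I'd also use `char in string.whitespace` rather than a regex, as it removes a function and is a simple `in` test.
And so, for everything except the function, I use: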
Your function can still be improved.
The changes you made from the original are:
The latter is half a good idea, but neither is too great.
The first moves the code out of the state sections, for no good reason.
The second removes the checks.
And so I'd move back to a 1:1 translation of the Go code.
However rather than using the while loop method, I'd use the iterator way.
Rather than your current way of:
The former is much more succinct.
And so you'd want to re-try from the conversion of the Go code in this form:
```
def parse(text):
words = []
word = []
state = State.SPACE
quote = ''
allow_blank = False
indexes = iter(range(len(text) + 1))
for index in indexes:
if index != len(text):
char = text[index]
if state is State.SPACE:
if index == len(rest):
break
if char in space:
continue
state = State.WORD
if (state is State.WORD or state is State.QUOTE) and index == len(text):
if allow_blank or word:
words.append(''.join(word))
break
if state is State.WORD:
if char in space:
state = State.SPACE
if allow_blank or word:
words.append(''.join(word))
word = []
allow_blank = False
continue
if char in '\'"':
quote = char
allow_blank = True
state = State.QUOTE
if char == TOKEN_ESCAPE:
if pos + 1 == len(te
What benefit is your `ParserState` class?
The only property that I think is OK is the `character` one; all the others are just noise and can go.
You also want to reset the `word` whenever you use `push_word`, and when you `append` or `push_character` you just want to push the character.
And so if you had to keep it I'd use the following:
class ParserState(object):
def __init__(self, text):
self.text = text
self.index = 0
self.state = State.space
self.quote = ''
self.word = ''
self.words = []
@property
def character(self):
return self._text[self.index]
def push_word(self, allow_empty=False):
if allow_empty or self.word:
self.words.append(self.word)
self.word = ''
def append(self):
self.word += self.character
However, this has no advantage over merging it with parse and keeping it a single function.
In fact it hinders the readability.
And so I'd merge it together to get:
def parse(text):
text = text
index = 0
state = State.space
quote = ''
word = ''
words = []
while index < len(text):
character = text[index]
if is_space(character):
if state == State.word:
if clear_word or word:
words.append(word)
if state != State.quote:
index += 1
continue
elif state == State.space:
state = State.word
if state == State.word:
if character in ('\'', '"'):
quote = character
state = State.quote
if character == '\\':
if index + 1 == len(text):
index += 1
continue
word += character
index += 1
word += character
index += 1
continue
if state == State.quote:
if character == quote:
state = State.word
if character == '\\' and quote != '\'':
if index + 1 == len(text):
index += 1
continue
index += 1
word += character
index += 1
words.append(word)
return words
For your enum I'd use a method that actually works in all 2.7 versions, rather than one that only works in the latest.
I used one of the methods in the top answer, but it shouldn't matter too much.
Just keep in mind that I use uppercase variables, as they are constants.
I'd also use `char in string.whitespace` rather than a regex, as it removes a function and is a simple `in` test.
And so, for everything except the function, I use:
from string import whitespace as space
def enum(*sequential, **named):
    """Build a simple enum-like class from member names.

    Args:
        *sequential: Member names; each is given its position (0, 1, 2, ...)
            as its value.
        **named: Extra members with explicit values.  A name given here
            replaces a positional member of the same name.

    Returns:
        A new class called ``Enum`` whose class attributes are the members.
    """
    members = dict(zip(sequential, range(len(sequential))), **named)
    return type('Enum', (), members)
State = enum('SPACE', 'WORD', 'QUOTE')
TOKEN_ESCAPE = '\\'
Your function can still be improved.
The changes you made from the original are:
- Checks if whitespace first.
- Moved the addition of the final word out of the loop but removed its checks.
The latter is half a good idea, but neither is too great.
The first moves the code out of the state sections, for no good reason.
The second removes the checks.
And so I'd move back to a 1:1 translation of the Go code.
However rather than using the while loop method, I'd use the iterator way.
indexes = iter(range(len(text)))
for index in indexes:
if (some test):
continue
if (some other test):
index = next(indexes)
Rather than your current way of:
index = 0
while index < len(text):
if (some test):
index += 1
continue
if (some other test):
index += 1
index += 1
The former is much more succinct.
And so you'd want to re-try from the conversion of the Go code in this form:
```
def parse(text):
words = []
word = []
state = State.SPACE
quote = ''
allow_blank = False
indexes = iter(range(len(text) + 1))
for index in indexes:
if index != len(text):
char = text[index]
if state is State.SPACE:
if index == len(rest):
break
if char in space:
continue
state = State.WORD
if (state is State.WORD or state is State.QUOTE) and index == len(text):
if allow_blank or word:
words.append(''.join(word))
break
if state is State.WORD:
if char in space:
state = State.SPACE
if allow_blank or word:
words.append(''.join(word))
word = []
allow_blank = False
continue
if char in '\'"':
quote = char
allow_blank = True
state = State.QUOTE
if char == TOKEN_ESCAPE:
if pos + 1 == len(te
Code Snippets
class ParserState(object):
    """Mutable state for the word parser.

    Attributes:
        text: The string being parsed.
        index: Position of the character currently being examined.
        state: The current ``State`` member (starts as ``State.space``).
        quote: The quote character recorded for the current quoted section.
        word: The word accumulated so far.
        words: Completed words, in input order.
    """

    def __init__(self, text):
        """Start at the first character of ``text`` with nothing collected."""
        self.text = text
        self.index = 0
        self.state = State.space
        self.quote = ''
        self.word = ''
        self.words = []

    @property
    def character(self):
        """The character of ``text`` at the current ``index``."""
        # The constructor stores the input as ``text``; there is no ``_text``.
        return self.text[self.index]

    def push_word(self, allow_empty=False):
        """Store the current word in ``words`` and start a fresh one.

        An empty word is only stored when ``allow_empty`` is true.
        """
        if allow_empty or self.word:
            self.words.append(self.word)
        self.word = ''

    def append(self):
        """Add the current character to the word being built."""
        self.word += self.character


def parse(text):
    """Split ``text`` into words, keeping quoted sections together.

    Whitespace outside quotes separates words.  A section opened by a single
    or double quote runs to the matching quote and stays part of the current
    word; the quote characters themselves are kept in the output.  Inside
    double quotes a backslash is dropped and the character after it is taken
    literally; outside quotes the backslash and the character after it are
    both kept.  A backslash that is the very last character of ``text`` is
    discarded.

    Uses ``State`` (members ``space``, ``word``, ``quote``) and ``is_space``
    defined elsewhere in this file.

    Args:
        text: The string to split.

    Returns:
        A list of words in input order; empty when ``text`` holds no words.
    """
    index = 0
    length = len(text)
    state = State.space
    quote = ''
    word = ''
    words = []

    while index < length:
        character = text[index]

        # Whitespace outside a quoted section ends the word in progress.
        if state != State.quote and is_space(character):
            if state == State.word:
                if word:
                    words.append(word)
                word = ''
                state = State.space
            index += 1
            continue

        if state == State.space:
            state = State.word

        if state == State.word:
            if character in ('\'', '"'):
                quote = character
                state = State.quote
            elif character == '\\':
                if index + 1 == length:
                    # Trailing backslash: drop it.
                    index += 1
                    continue
                # Keep the backslash, then re-read so the following
                # character (not the backslash again) is appended below.
                word += character
                index += 1
                character = text[index]
            word += character
            index += 1
            continue

        # Inside a quoted section.
        if character == quote:
            state = State.word
        elif character == '\\' and quote != '\'':
            if index + 1 == length:
                # Trailing backslash: drop it.
                index += 1
                continue
            # Drop the backslash and take the escaped character literally.
            index += 1
            character = text[index]
        word += character
        index += 1

    # Only emit the last word when something was collected, so empty or
    # all-whitespace input yields an empty list.
    if word:
        words.append(word)
    return words


from string import whitespace as space
def enum(*sequential, **named):
    """Build a simple enum-like class from member names.

    Args:
        *sequential: Member names; each is given its position (0, 1, 2, ...)
            as its value.
        **named: Extra members with explicit values.  A name given here
            replaces a positional member of the same name.

    Returns:
        A new class called ``Enum`` whose class attributes are the members.
    """
    members = dict(zip(sequential, range(len(sequential))), **named)
    return type('Enum', (), members)
State = enum('SPACE', 'WORD', 'QUOTE')
TOKEN_ESCAPE = '\\'
indexes = iter(range(len(text)))
for index in indexes:
if (some test):
continue
if (some other test):
index = next(indexes)
index = 0
while index < len(text):
if (some test):
index += 1
continue
if (some other test):
index += 1
index += 1
Context
StackExchange Code Review Q#131193, answer score: 2
Revisions (0)
No revisions yet.