patternpythonMinor
Remove comments from C-like source code
Viewed 0 times
sourcelikeremovecodefromcomments
Problem
I am working on the problem of removing comments from C-like source code. Here is my code in Python 2.7. If anyone could advise on areas for improvement (especially performance), or point out any functional bugs that I have not discovered, that would be great.
Problem statement
Given a file path represented as string, take this input string and remove all the comments in the file, print this file or save this to a new txt file by your choice.
Cases to consider:
Source code
Problem statement
Given a file path represented as string, take this input string and remove all the comments in the file, print this file or save this to a new txt file by your choice.
Cases to consider:
// comment
/*
comment
*/
foo(); // comment
Source code
# Sample C-like input used by the demo run: a line comment, a block comment
# whose body contains a second "/*" opener, and a statement followed by a
# trailing line comment.
code='''// comment
/*
/* hello python */
comment
*/
foo(); // comment
'''
def remove_comment(content):
    """Return *content* with C-style ``//`` and ``/* ... */`` comments removed.

    Block comments do not nest: a comment ends at the first ``*/`` that
    follows its opening ``/*``.  A line comment runs up to, but does not
    include, the newline that terminates it, so line structure is preserved.
    Comment markers that appear inside another comment are ignored.  An
    unterminated comment swallows the rest of the input.

    String literals are not recognised; a ``//`` or ``/*`` inside a quoted
    string is treated as a comment start.
    """
    result = []
    index = 0
    length = len(content)
    while index < length:
        pair = content[index:index + 2]
        if pair == '/*':
            # Jump straight to the closing marker instead of walking the
            # comment body one character at a time.
            end = content.find('*/', index + 2)
            if end == -1:
                break  # unterminated block comment: nothing left to keep
            index = end + 2
        elif pair == '//':
            end = content.find('\n', index + 2)
            if end == -1:
                break  # line comment runs to end of input
            index = end  # resume at the newline so it is kept in the output
        else:
            result.append(content[index])
            index += 1
    return ''.join(result)
if __name__ == "__main__":
    # Call print as a function so the demo runs on Python 3 as well as 2.7.
    print(remove_comment(code))


# Solution
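First off, as said in the comments, a C-style block comment ends at the first */ that follows its opening /*, meaning you cannot nest comments. The text /* This comment is /* a nested */ comment */ should therefore be interpreted as a comment followed by the leftover text " comment */".
It is also more natural, in Python, to iterate over the elements of a collection rather than their indices. This allows you to write "for character in content:", and if you truly need indices, you can use enumerate.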
You can also use temporary variables to store characters that may indicate the beginning or the end of a comment without having to look at the character before or after the current one:
You can also simplify a bit the memory management by using a generator instead of appending into a list:
If you want to go crazy, you can also use a state machine approach to simplify the code: no more boolean flags and far less comparisons in average:
But, all in all, this is far too complicated for the task at hand. You can get the same job done using a simple regular expression:
A C-style block comment ends at the first */ that follows its opening /*, meaning you cannot nest comments:
/* This comment is /* a nested */ comment */
should be interpreted as:
 comment */
It is also more natural, in Python, to iterate over the elements of a collection rather than their indices. This allows you to write "for character in content:". And if you truly need indices, you can use enumerate.
You can also use temporary variables to store characters that may indicate the beginning or the end of a comment without having to look at the character before or after the current one:
def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed.

    Characters are scanned one at a time.  A ``/`` seen in source text, or a
    ``*`` seen inside a block comment, is held back until the next character
    shows whether it starts or ends a comment.  Block comments do not nest,
    and the newline that ends a line comment is kept.
    """
    block_comment = False
    line_comment = False
    probably_a_comment = False
    result = []
    for character in content:
        in_comment = line_comment or block_comment
        # Only hold back the FIRST slash; a second one must fall through so
        # that "//" is recognised as a line comment.
        if not in_comment and not probably_a_comment and character == '/':
            probably_a_comment = True
            continue
        if block_comment and character == '*':
            probably_a_comment = True
            continue
        if line_comment and character == '\n':
            line_comment = False
            result.append('\n')
        elif block_comment and probably_a_comment and character == '/':
            block_comment = False
        elif not in_comment:
            if probably_a_comment:
                if character == '/':
                    line_comment = True
                elif character == '*':
                    block_comment = True
                else:
                    # The held-back slash was ordinary source text.
                    result.append('/')
                    result.append(character)
            else:
                result.append(character)
        probably_a_comment = False
    # A slash still pending at end of input was never part of a comment.
    if probably_a_comment and not (line_comment or block_comment):
        result.append('/')
    return ''.join(result)


# You can also simplify a bit the memory management by using a generator instead of appending into a list:
def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed.

    Same character-by-character scan as the list-based version, but the kept
    characters are produced by an inner generator and joined once at the end.
    Block comments do not nest, and the newline that ends a line comment is
    kept.
    """
    def gen_content():
        block_comment = False
        line_comment = False
        probably_a_comment = False
        for character in content:
            in_comment = line_comment or block_comment
            # Only hold back the FIRST slash; a second one must fall through
            # so that "//" is recognised as a line comment.
            if not in_comment and not probably_a_comment and character == '/':
                probably_a_comment = True
                continue
            if block_comment and character == '*':
                probably_a_comment = True
                continue
            if line_comment and character == '\n':
                line_comment = False
                yield '\n'
            elif block_comment and probably_a_comment and character == '/':
                block_comment = False
            elif not in_comment:
                if probably_a_comment:
                    if character == '/':
                        line_comment = True
                    elif character == '*':
                        block_comment = True
                    else:
                        # The held-back slash was ordinary source text.
                        yield '/'
                        yield character
                else:
                    yield character
            probably_a_comment = False
        # A slash still pending at end of input was never part of a comment.
        if probably_a_comment and not (line_comment or block_comment):
            yield '/'
    return ''.join(gen_content())


# If you want to go crazy, you can also use a state machine approach to simplify the code: no more boolean flags and far less comparisons in average:
# Each state function takes the next character and returns a pair:
# (state function to use for the following character, text to emit).

def source_code(char):
    """State: ordinary source text; a '/' is held back for one character."""
    if char == '/':
        return comment_begin, ''
    return source_code, char


def comment_begin(char):
    """State: a single '/' has just been seen outside any comment."""
    if char == '/':
        return inline_comment, ''
    if char == '*':
        return block_comment, ''
    # Not a comment after all: emit the held-back slash with this character.
    return source_code, '/' + char


def inline_comment(char):
    """State: inside a ``//`` comment; the terminating newline is kept."""
    if char == '\n':
        return source_code, char
    return inline_comment, ''


def block_comment(char):
    """State: inside a ``/* ... */`` comment."""
    if char == '*':
        return end_block_comment, ''
    return block_comment, ''


def end_block_comment(char):
    """State: inside a block comment, right after a '*'."""
    if char == '/':
        return source_code, ''
    if char == '*':
        # In a run such as "**/" the latest '*' may still start the closer.
        return end_block_comment, ''
    return block_comment, ''


def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed.

    Drives the state functions above over every character of *content* and
    joins the emitted text.  Block comments do not nest.
    """
    def gen_content():
        parser = source_code
        for character in content:
            parser, text = parser(character)
            yield text
        # A slash still held back at end of input is ordinary source text.
        if parser is comment_begin:
            yield '/'
    return ''.join(gen_content())


# But, all in all, this is far too complicated for the task at hand. You can get the same job done using a simple regular expression:
import re
# re.DOTALL lets "." cross newlines so multi-line block comments match.
# The line-comment branch stops before the newline, leaving it in the text.
COMMENTS = re.compile(r'''
    (//[^\n]*)      # Everything between // and the end of the line/file
    |               # or
    (/\*.*?\*/)     # Everything between /* and the first following */
''', re.VERBOSE | re.DOTALL)


def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed.

    Comments are deleted outright; the newline after a line comment is not
    part of the match and therefore stays.  Block comments do not nest.
    """
    return COMMENTS.sub('', content)


# Code Snippets
# /* This comment is /* a nested */ comment */


# Snippet 1: boolean flags, output collected in a list.
def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed.

    A ``/`` seen in source text, or a ``*`` seen inside a block comment, is
    held back until the next character shows whether it starts or ends a
    comment.  Block comments do not nest, and the newline that ends a line
    comment is kept.
    """
    block_comment = False
    line_comment = False
    probably_a_comment = False
    result = []
    for character in content:
        in_comment = line_comment or block_comment
        # Only hold back the FIRST slash; a second one must fall through so
        # that "//" is recognised as a line comment.
        if not in_comment and not probably_a_comment and character == '/':
            probably_a_comment = True
            continue
        if block_comment and character == '*':
            probably_a_comment = True
            continue
        if line_comment and character == '\n':
            line_comment = False
            result.append('\n')
        elif block_comment and probably_a_comment and character == '/':
            block_comment = False
        elif not in_comment:
            if probably_a_comment:
                if character == '/':
                    line_comment = True
                elif character == '*':
                    block_comment = True
                else:
                    # The held-back slash was ordinary source text.
                    result.append('/')
                    result.append(character)
            else:
                result.append(character)
        probably_a_comment = False
    # A slash still pending at end of input was never part of a comment.
    if probably_a_comment and not (line_comment or block_comment):
        result.append('/')
    return ''.join(result)


# Snippet 2: the same scan, with the kept characters produced by a generator.
def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed."""
    def gen_content():
        block_comment = False
        line_comment = False
        probably_a_comment = False
        for character in content:
            in_comment = line_comment or block_comment
            # Only hold back the FIRST slash (see snippet 1).
            if not in_comment and not probably_a_comment and character == '/':
                probably_a_comment = True
                continue
            if block_comment and character == '*':
                probably_a_comment = True
                continue
            if line_comment and character == '\n':
                line_comment = False
                yield '\n'
            elif block_comment and probably_a_comment and character == '/':
                block_comment = False
            elif not in_comment:
                if probably_a_comment:
                    if character == '/':
                        line_comment = True
                    elif character == '*':
                        block_comment = True
                    else:
                        yield '/'
                        yield character
                else:
                    yield character
            probably_a_comment = False
        # A slash still pending at end of input was never part of a comment.
        if probably_a_comment and not (line_comment or block_comment):
            yield '/'
    return ''.join(gen_content())


# Snippet 3: state machine.  Each state function takes the next character and
# returns (state function for the following character, text to emit).
def source_code(char):
    """State: ordinary source text; a '/' is held back for one character."""
    if char == '/':
        return comment_begin, ''
    return source_code, char


def comment_begin(char):
    """State: a single '/' has just been seen outside any comment."""
    if char == '/':
        return inline_comment, ''
    if char == '*':
        return block_comment, ''
    # Not a comment after all: emit the held-back slash with this character.
    return source_code, '/' + char


def inline_comment(char):
    """State: inside a ``//`` comment; the terminating newline is kept."""
    if char == '\n':
        return source_code, char
    return inline_comment, ''


def block_comment(char):
    """State: inside a ``/* ... */`` comment."""
    if char == '*':
        return end_block_comment, ''
    return block_comment, ''


def end_block_comment(char):
    """State: inside a block comment, right after a '*'."""
    if char == '/':
        return source_code, ''
    if char == '*':
        # In a run such as "**/" the latest '*' may still start the closer.
        return end_block_comment, ''
    return block_comment, ''


def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed."""
    def gen_content():
        parser = source_code
        for character in content:
            parser, text = parser(character)
            yield text
        # A slash still held back at end of input is ordinary source text.
        if parser is comment_begin:
            yield '/'
    return ''.join(gen_content())


import re
# re.DOTALL lets "." cross newlines so multi-line block comments match.
# The line-comment branch stops before the newline, leaving it in the text.
COMMENTS = re.compile(r'''
    (//[^\n]*)      # Everything between // and the end of the line/file
    |               # or
    (/\*.*?\*/)     # Everything between /* and the first following */
''', re.VERBOSE | re.DOTALL)


def remove_comments(content):
    """Return *content* with ``//`` and ``/* ... */`` comments removed.

    Comments are deleted outright; the newline after a line comment is not
    part of the match and therefore stays.  Block comments do not nest.
    """
    return COMMENTS.sub('', content)


# Context
StackExchange Code Review Q#148305, answer score: 6
Revisions (0)
No revisions yet.