patternjavaMinor
HTTP Authorization header parser
Viewed 0 times
parserauthorizationhttpheader
Problem
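I'm writing a parser for the HTTP Authorization header (see RFC2616#14.8 and RFC2617#1.2). Note that I explicitly don't care about the base64-encoded syntax used by HTTP Basic authentication. I'm only interested in the auth-param syntax used by Digest authentication: a list of key=value pairs separated by commas, where each value may be quoted or unquoted.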
Here's my code, which seems to parse the examples from the RFC just fine:
```
package com.example.sample;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class AuthorizationHeaderParser {
private static final String SEPARATORS = "()<>@,;:\\\\\"/\\[\\]?={} \t";
private static final Pattern TOKEN_PATTERN = Pattern
.compile("[[\\p{ASCII}]&&[^" + SEPARATORS + "]&&[^\\p{Cntrl}]]+");
private static final Pattern EQ_PATTERN = Pattern.compile("=");
private static final Pattern TOKEN_QUOTED_PATTERN = Pattern
.compile("\"([^\"]|\\\\\\p{ASCII})*\"");
private static final Pattern COMMA_PATTERN = Pattern.compile(",");
private static final Pattern LWS_PATTERN = Pattern
.compile("(\r?\n)?[ \t]+");
private static class Tokenizer {
private String remaining;
public Tokenizer(String input) {
remaining = input;
}
private void skipSpaces() {
Matcher m = LWS_PATTERN.matcher(remaining);
if (!m.lookingAt()) {
return;
}
String match = m.group();
remaining = remaining.substring(match.length());
}
public String match(Pattern p) {
skipSpaces();
Matcher m = p.matcher(remaining);
if (!m.lookingAt()) {
return null;
}
String match = m.group(
auth-param syntax used by Digest authentication (to be more specific, I'm implementing a custom Authorization header similar to this question on SO). Basically, it's just a list of key=value pairs separated by commas, and each value can be quoted or unquoted. Here's my code, which seems to parse the examples from the RFC just fine:
```
package com.example.sample;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class AuthorizationHeaderParser {
private static final String SEPARATORS = "()<>@,;:\\\\\"/\\[\\]?={} \t";
private static final Pattern TOKEN_PATTERN = Pattern
.compile("[[\\p{ASCII}]&&[^" + SEPARATORS + "]&&[^\\p{Cntrl}]]+");
private static final Pattern EQ_PATTERN = Pattern.compile("=");
private static final Pattern TOKEN_QUOTED_PATTERN = Pattern
.compile("\"([^\"]|\\\\\\p{ASCII})*\"");
private static final Pattern COMMA_PATTERN = Pattern.compile(",");
private static final Pattern LWS_PATTERN = Pattern
.compile("(\r?\n)?[ \t]+");
private static class Tokenizer {
private String remaining;
public Tokenizer(String input) {
remaining = input;
}
private void skipSpaces() {
Matcher m = LWS_PATTERN.matcher(remaining);
if (!m.lookingAt()) {
return;
}
String match = m.group();
remaining = remaining.substring(match.length());
}
public String match(Pattern p) {
skipSpaces();
Matcher m = p.matcher(remaining);
if (!m.lookingAt()) {
return null;
}
String match = m.group(
Solution
Going through your specific questions, I have the following suggestions:
OK, so what are the other suggestions.....
So, as an exercise, I took your code and implemented both a state-machine version and a Scanner version. I have used state machines in the past to parse comma-separated value files, and the process was very fast... I figured it made sense here too. The Scanner version is more complicated than I would have hoped, but you may find the implementation to be educational (I did).
As for a review of your code.... I found it 'easier' to write it again myself, than to try to understand yours. In a sense, that says a lot.
```
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class AuthorizationHeaderParser {
/* ****************************************
 * OP Mechanism
 * **************************************** */
// Characters that may not appear in a token (the RFC 2616 section 2.2
// "separators"), already escaped so the string can be spliced into a regex
// character class.
private static final String SEPARATORS = "()<>@,;:\\\\\"/\\[\\]?={} \t";
// token: one or more ASCII characters that are neither separators nor
// control characters.
private static final Pattern TOKEN_PATTERN = Pattern
        .compile("[[\\p{ASCII}]&&[^" + SEPARATORS + "]&&[^\\p{Cntrl}]]+");
private static final Pattern EQ_PATTERN = Pattern.compile("=");
// quoted-string: a double quote, then any mix of ordinary characters and
// backslash-escaped ASCII characters, then a closing double quote.
// The first alternative must exclude the backslash; otherwise the backslash
// of an escaped quote (\") is consumed as an ordinary character and the
// escaped quote is mistaken for the closing quote.
private static final Pattern TOKEN_QUOTED_PATTERN = Pattern
        .compile("\"([^\"\\\\]|\\\\\\p{ASCII})*\"");
private static final Pattern COMMA_PATTERN = Pattern.compile(",");
// One run of linear white space: an optional line break (LF or CRLF)
// followed by at least one space or tab.
private static final Pattern LWS_PATTERN = Pattern
        .compile("(\r?\n)?[ \t]+");
/**
 * Consumes an input string from the front, one regex match at a time.
 * Leading linear white space is discarded before every match attempt.
 */
private static class Tokenizer {
    /** The part of the input that has not been consumed yet. */
    private String remaining;

    /**
     * @param input the text to tokenize; must not be null
     * @throws NullPointerException if {@code input} is null
     */
    public Tokenizer(String input) {
        if (input == null) {
            throw new NullPointerException("input");
        }
        remaining = input;
    }

    /**
     * Removes all leading linear white space from the remaining input.
     * A single LWS_PATTERN match covers at most one optional line break plus
     * the blanks after it, so the match is repeated until it fails; this way
     * a sequence such as "blank, line break, blank" is skipped completely.
     */
    private void skipSpaces() {
        Matcher m = LWS_PATTERN.matcher(remaining);
        // Every successful match consumes at least one character, so the
        // loop terminates.
        while (m.lookingAt()) {
            remaining = remaining.substring(m.end());
            m.reset(remaining);
        }
    }

    /**
     * Tries to match {@code p} at the start of the remaining input, after
     * skipping white space, and consumes the matched text on success.
     *
     * @param p the pattern to look for
     * @return the matched text, or null if the input does not start with a
     *         match (nothing but white space is consumed in that case)
     */
    public String match(Pattern p) {
        skipSpaces();
        Matcher m = p.matcher(remaining);
        if (!m.lookingAt()) {
            return null;
        }
        String match = m.group();
        remaining = remaining.substring(m.end());
        return match;
    }

    /**
     * Like {@link #match(Pattern)}, but a failed match is an error.
     *
     * @param p the pattern that must match next
     * @return the matched text, never null
     * @throws NoSuchElementException if the input does not start with a match
     */
    public String mustMatch(Pattern p) {
        String match = match(p);
        if (match == null) {
            // The remaining input is deliberately left out of the message:
            // it may contain credentials.
            throw new NoSuchElementException(
                    "Expected input matching: " + p.pattern());
        }
        return match;
    }

    /**
     * @return true if anything other than white space is left to consume
     */
    public boolean hasMore() {
        skipSpaces();
        return remaining.length() > 0;
    }
}
/**
 * Parses a header value of the form
 * {@code scheme key1=value1, key2="value 2", ...} into a map.
 *
 * <p>The leading token is stored under the key {@code ":auth-scheme"}; that
 * entry is null when the input does not start with a token. Empty list
 * elements (stray commas) are ignored. When a key occurs more than once the
 * last value wins. Quoted values are stored without their surrounding
 * quotes; backslash escapes inside them are left as written.
 *
 * @param input the header value to parse
 * @return a mutable map of parameter names to values
 * @throws NoSuchElementException if the input is not a well-formed list of
 *         {@code key=value} pairs
 */
public static Map<String, String> parse(String input) {
    Tokenizer t = new Tokenizer(input);
    Map<String, String> map = new HashMap<String, String>();

    String authScheme = t.match(TOKEN_PATTERN);
    map.put(":auth-scheme", authScheme);

    while (true) {
        while (t.match(COMMA_PATTERN) != null) {
            // Skip null list elements
        }
        if (!t.hasMore()) {
            break;
        }

        String key = t.mustMatch(TOKEN_PATTERN);
        t.mustMatch(EQ_PATTERN);

        // A value is either a bare token or a quoted string.
        String value = t.match(TOKEN_PATTERN);
        if (value == null) {
            value = t.mustMatch(TOKEN_QUOTED_PATTERN);
            // trim quotes
            value = value.substring(1, value.length() - 1);
        }
        map.put(key, value);

        // Anything that follows a pair must start with a comma.
        if (t.hasMore()) {
            t.mustMatch(COMMA_PATTERN);
        }
    }
    return map;
}
/* ****************************************
 * State Machine Mechanism
 * **************************************** */
private static enum ParseState{
PROLOGSPACE,
PROLOGWORD,
KEY,
KEYVALGAP,
VALUE,
QUOTEDVALUE,
SEPARATOR,
COMPLETE;
}
private static final String WHITESPAC
- Should you write a 'real' parser? - Depends. Parsers can be complicated, and they make assumptions. Regardless, you have already written your own parser, and it is 'real'.
- This is the BIG question... is it right? - With regexes it is often hard to tell, and it requires careful analysis of the regex and the data to find out. I have looked at your code, and inspected the regex, and, frankly, it was more complicated than I could easily understand in one sitting.... (and without 'playing' with the code). So, is it right? I don't know.
- Are the regexes too complicated (can they be simplified)? - yes, I would say yes to being too complicated, and unsure about whether they can be simplified.
- Other suggestions? - yes, a few.... which leads on to:
OK, so what are the other suggestions.....
- Since you have a class called Tokenizer, it is apparent you are breaking the code into tokens.... why don't you just use the tools in Java to do the work for you?
- This problem is commonly solved with a State machine as well, which are sometimes much faster, and quite interesting.
So, as an exercise, I took your code and implemented both a state-machine version and a Scanner version. I have used state machines in the past to parse comma-separated value files, and the process was very fast... I figured it made sense here too. The Scanner version is more complicated than I would have hoped, but you may find the implementation to be educational (I did).
As for a review of your code.... I found it 'easier' to write it again myself, than to try to understand yours. In a sense, that says a lot.
```
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class AuthorizationHeaderParser {
/* ****************************************
 * OP Mechanism
 * **************************************** */
// Characters that may not appear in a token (the RFC 2616 section 2.2
// "separators"), already escaped so the string can be spliced into a regex
// character class.
private static final String SEPARATORS = "()<>@,;:\\\\\"/\\[\\]?={} \t";
// token: one or more ASCII characters that are neither separators nor
// control characters.
private static final Pattern TOKEN_PATTERN = Pattern
        .compile("[[\\p{ASCII}]&&[^" + SEPARATORS + "]&&[^\\p{Cntrl}]]+");
private static final Pattern EQ_PATTERN = Pattern.compile("=");
// quoted-string: a double quote, then any mix of ordinary characters and
// backslash-escaped ASCII characters, then a closing double quote.
// The first alternative must exclude the backslash; otherwise the backslash
// of an escaped quote (\") is consumed as an ordinary character and the
// escaped quote is mistaken for the closing quote.
private static final Pattern TOKEN_QUOTED_PATTERN = Pattern
        .compile("\"([^\"\\\\]|\\\\\\p{ASCII})*\"");
private static final Pattern COMMA_PATTERN = Pattern.compile(",");
// One run of linear white space: an optional line break (LF or CRLF)
// followed by at least one space or tab.
private static final Pattern LWS_PATTERN = Pattern
        .compile("(\r?\n)?[ \t]+");
/**
 * Consumes an input string from the front, one regex match at a time.
 * Leading linear white space is discarded before every match attempt.
 */
private static class Tokenizer {
    /** The part of the input that has not been consumed yet. */
    private String remaining;

    /**
     * @param input the text to tokenize; must not be null
     * @throws NullPointerException if {@code input} is null
     */
    public Tokenizer(String input) {
        if (input == null) {
            throw new NullPointerException("input");
        }
        remaining = input;
    }

    /**
     * Removes all leading linear white space from the remaining input.
     * A single LWS_PATTERN match covers at most one optional line break plus
     * the blanks after it, so the match is repeated until it fails; this way
     * a sequence such as "blank, line break, blank" is skipped completely.
     */
    private void skipSpaces() {
        Matcher m = LWS_PATTERN.matcher(remaining);
        // Every successful match consumes at least one character, so the
        // loop terminates.
        while (m.lookingAt()) {
            remaining = remaining.substring(m.end());
            m.reset(remaining);
        }
    }

    /**
     * Tries to match {@code p} at the start of the remaining input, after
     * skipping white space, and consumes the matched text on success.
     *
     * @param p the pattern to look for
     * @return the matched text, or null if the input does not start with a
     *         match (nothing but white space is consumed in that case)
     */
    public String match(Pattern p) {
        skipSpaces();
        Matcher m = p.matcher(remaining);
        if (!m.lookingAt()) {
            return null;
        }
        String match = m.group();
        remaining = remaining.substring(m.end());
        return match;
    }

    /**
     * Like {@link #match(Pattern)}, but a failed match is an error.
     *
     * @param p the pattern that must match next
     * @return the matched text, never null
     * @throws NoSuchElementException if the input does not start with a match
     */
    public String mustMatch(Pattern p) {
        String match = match(p);
        if (match == null) {
            // The remaining input is deliberately left out of the message:
            // it may contain credentials.
            throw new NoSuchElementException(
                    "Expected input matching: " + p.pattern());
        }
        return match;
    }

    /**
     * @return true if anything other than white space is left to consume
     */
    public boolean hasMore() {
        skipSpaces();
        return remaining.length() > 0;
    }
}
/**
 * Parses a header value of the form
 * {@code scheme key1=value1, key2="value 2", ...} into a map.
 *
 * <p>The leading token is stored under the key {@code ":auth-scheme"}; that
 * entry is null when the input does not start with a token. Empty list
 * elements (stray commas) are ignored. When a key occurs more than once the
 * last value wins. Quoted values are stored without their surrounding
 * quotes; backslash escapes inside them are left as written.
 *
 * @param input the header value to parse
 * @return a mutable map of parameter names to values
 * @throws NoSuchElementException if the input is not a well-formed list of
 *         {@code key=value} pairs
 */
public static Map<String, String> parse(String input) {
    Tokenizer t = new Tokenizer(input);
    Map<String, String> map = new HashMap<String, String>();

    String authScheme = t.match(TOKEN_PATTERN);
    map.put(":auth-scheme", authScheme);

    while (true) {
        while (t.match(COMMA_PATTERN) != null) {
            // Skip null list elements
        }
        if (!t.hasMore()) {
            break;
        }

        String key = t.mustMatch(TOKEN_PATTERN);
        t.mustMatch(EQ_PATTERN);

        // A value is either a bare token or a quoted string.
        String value = t.match(TOKEN_PATTERN);
        if (value == null) {
            value = t.mustMatch(TOKEN_QUOTED_PATTERN);
            // trim quotes
            value = value.substring(1, value.length() - 1);
        }
        map.put(key, value);

        // Anything that follows a pair must start with a comma.
        if (t.hasMore()) {
            t.mustMatch(COMMA_PATTERN);
        }
    }
    return map;
}
/* ****************************************
 * State Machine Mechanism
 * **************************************** */
private static enum ParseState{
PROLOGSPACE,
PROLOGWORD,
KEY,
KEYVALGAP,
VALUE,
QUOTEDVALUE,
SEPARATOR,
COMPLETE;
}
private static final String WHITESPAC
Code Snippets
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Scanner;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class AuthorizationHeaderParser {
/* ****************************************
* OP Mechanism
* **************************************** */
// Characters that may not appear in a token (the RFC 2616 section 2.2
// "separators"), already escaped so the string can be spliced into a regex
// character class.
private static final String SEPARATORS = "()<>@,;:\\\\\"/\\[\\]?={} \t";
// token: one or more ASCII characters that are neither separators nor
// control characters.
private static final Pattern TOKEN_PATTERN = Pattern
        .compile("[[\\p{ASCII}]&&[^" + SEPARATORS + "]&&[^\\p{Cntrl}]]+");
private static final Pattern EQ_PATTERN = Pattern.compile("=");
// quoted-string: a double quote, then any mix of ordinary characters and
// backslash-escaped ASCII characters, then a closing double quote.
// The first alternative must exclude the backslash; otherwise the backslash
// of an escaped quote (\") is consumed as an ordinary character and the
// escaped quote is mistaken for the closing quote.
private static final Pattern TOKEN_QUOTED_PATTERN = Pattern
        .compile("\"([^\"\\\\]|\\\\\\p{ASCII})*\"");
private static final Pattern COMMA_PATTERN = Pattern.compile(",");
// One run of linear white space: an optional line break (LF or CRLF)
// followed by at least one space or tab.
private static final Pattern LWS_PATTERN = Pattern
        .compile("(\r?\n)?[ \t]+");
/**
 * Consumes an input string from the front, one regex match at a time.
 * Leading linear white space is discarded before every match attempt.
 */
private static class Tokenizer {
    /** The part of the input that has not been consumed yet. */
    private String remaining;

    /**
     * @param input the text to tokenize; must not be null
     * @throws NullPointerException if {@code input} is null
     */
    public Tokenizer(String input) {
        if (input == null) {
            throw new NullPointerException("input");
        }
        remaining = input;
    }

    /**
     * Removes all leading linear white space from the remaining input.
     * A single LWS_PATTERN match covers at most one optional line break plus
     * the blanks after it, so the match is repeated until it fails; this way
     * a sequence such as "blank, line break, blank" is skipped completely.
     */
    private void skipSpaces() {
        Matcher m = LWS_PATTERN.matcher(remaining);
        // Every successful match consumes at least one character, so the
        // loop terminates.
        while (m.lookingAt()) {
            remaining = remaining.substring(m.end());
            m.reset(remaining);
        }
    }

    /**
     * Tries to match {@code p} at the start of the remaining input, after
     * skipping white space, and consumes the matched text on success.
     *
     * @param p the pattern to look for
     * @return the matched text, or null if the input does not start with a
     *         match (nothing but white space is consumed in that case)
     */
    public String match(Pattern p) {
        skipSpaces();
        Matcher m = p.matcher(remaining);
        if (!m.lookingAt()) {
            return null;
        }
        String match = m.group();
        remaining = remaining.substring(m.end());
        return match;
    }

    /**
     * Like {@link #match(Pattern)}, but a failed match is an error.
     *
     * @param p the pattern that must match next
     * @return the matched text, never null
     * @throws NoSuchElementException if the input does not start with a match
     */
    public String mustMatch(Pattern p) {
        String match = match(p);
        if (match == null) {
            // The remaining input is deliberately left out of the message:
            // it may contain credentials.
            throw new NoSuchElementException(
                    "Expected input matching: " + p.pattern());
        }
        return match;
    }

    /**
     * @return true if anything other than white space is left to consume
     */
    public boolean hasMore() {
        skipSpaces();
        return remaining.length() > 0;
    }
}
public static Map<String, String> parse(String input) {
Tokenizer t = new Tokenizer(input);
Map<String, String> map = new HashMap<String, String>();
String authScheme = t.match(TOKEN_PATTERN);
map.put(":auth-scheme", authScheme);
while (true) {
while (t.match(COMMA_PATTERN) != null) {
// Skip null list elements
}
if (!t.hasMore()) {
break;
}
String key = t.mustMatch(TOKEN_PATTERN);
t.mustMatch(EQ_PATTERN);
String value = t.match(TOKEN_PATTERN);
if (value == null) {
value = t.mustMatch(TOKEN_QUOTED_PATTERN);
// trim quotes
value = value.substring(1, value.length() - 1);
}
map.put(key, value);
if (t.hasMore()) {
t.mustMatch(COMMA_PATTERN);
}
}
return map;
Context
StackExchange Code Review Q#41270, answer score: 2
Revisions (0)
No revisions yet.