patternjavaMinor
Build a sentence from tokens / words in a String-Array
Viewed 0 times
arraywordstokenssentencefromstringbuild
Problem
I'm facing an interesting issue at the moment:
My Situation:
I have (in Java) String arrays like the following (more complicated, of course). Each String array represents one sentence (I can't change the representation):
My Problem:
I want to rebuild the original sentences from these String arrays. This doesn't sound that hard at first, but it becomes really complex because sentence structure has many special cases: sometimes you need whitespace between tokens and sometimes you don't.
My Approach:
I've implemented a method that should do most of the tasks, which means rebuilding a sentence from the original String-Array. As you can see, it's very complex and complicated already, but works "okay" for the moment - I don't know how to improve it at the moment.
```
public static String detokenize(String[] tokens) {
StringBuilder sentence = new StringBuilder();
boolean sentenceInQuotation = false;
boolean firstWordInQuotationSentence = false;
boolean firstWordInParenthisis = false;
boolean date = false;
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].equals(".") || tokens[i].equals(";") || tokens[i].equals(",") || tokens[i].equals("?") || tokens[i].equals("!")) {
sentence.append(tokens[i]);
}
else if(tokens[i].equals(":")){
Pattern p = Pattern.compile("\\d");
Matcher m = p.matcher(tokens[i-1]);
if(m.find() == true){
date = true;
}
sentence.append(tokens[i]);
}
else if(tokens[i].equals("(")){
sentence.append(" ");
sentence.append(tokens[i]);
firstWordInParenthisis = true;
}
else if (tokens[i].equals(")")) {
sentence.append(tokens[i]);
firstWordInParenthisis = false;
}
else if(tokens[i].equals("\"")){
if(sentenceInQuotation == false){
My Situation:
I have (in Java) String arrays like the following (more complicated, of course). Each String array represents one sentence (I can't change the representation):
String[] tokens = {"This", "is", "just", "an", "example", "."};
My Problem:
I want to rebuild the original sentences from these String arrays. This doesn't sound that hard at first, but it becomes really complex because sentence structure has many special cases: sometimes you need whitespace between tokens and sometimes you don't.
My Approach:
I've implemented a method that should do most of the tasks, which means rebuilding a sentence from the original String-Array. As you can see, it's very complex and complicated already, but works "okay" for the moment - I don't know how to improve it at the moment.
```
public static String detokenize(String[] tokens) {
StringBuilder sentence = new StringBuilder();
boolean sentenceInQuotation = false;
boolean firstWordInQuotationSentence = false;
boolean firstWordInParenthisis = false;
boolean date = false;
for (int i = 0; i < tokens.length; i++) {
if (tokens[i].equals(".") || tokens[i].equals(";") || tokens[i].equals(",") || tokens[i].equals("?") || tokens[i].equals("!")) {
sentence.append(tokens[i]);
}
else if(tokens[i].equals(":")){
Pattern p = Pattern.compile("\\d");
Matcher m = p.matcher(tokens[i-1]);
if(m.find() == true){
date = true;
}
sentence.append(tokens[i]);
}
else if(tokens[i].equals("(")){
sentence.append(" ");
sentence.append(tokens[i]);
firstWordInParenthisis = true;
}
else if (tokens[i].equals(")")) {
sentence.append(tokens[i]);
firstWordInParenthisis = false;
}
else if(tokens[i].equals("\"")){
if(sentenceInQuotation == false){
Solution
OpenNLP will provide a more robust solution, but the following approximation may be good enough.
The general rule is to join the 'words' with a space between them; there are three exceptions:
The code underneath has been tested with this sentence:
A test, (string). Hello this is a 2nd sentence. Here is a quote: "This is the quote." Sentence 4.
And the test case...
The general rule is to join the 'words' with a space between them; there are three exceptions:
- Special punctuation characters that should not have a space before, eg . ; :
- Special punctuation characters that should not have a space after, eg ( [
- Quoted sentences in which case the " will start in case 2 and then switch after each occurrence
The code underneath has been tested with this sentence:
A test, (string). Hello this is a 2nd sentence. Here is a quote: "This is the quote." Sentence 4.
import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;
public class Detokenizer {
/**
 * Rebuilds a sentence from its tokens.
 *
 * <p>Tokens are joined with a single space, with these exceptions:
 * <ul>
 *   <li>no space is written before closing punctuation
 *       (comma, period, semicolon, colon, question mark, exclamation mark,
 *       closing parenthesis, closing brace, closing bracket);</li>
 *   <li>no space is written after an opening parenthesis, bracket or brace,
 *       after an empty token, or before the very first token;</li>
 *   <li>double quotes alternate between opening and closing: an opening
 *       quote is glued to the token after it, a closing quote to the token
 *       before it. Quotes are assumed to come in matching pairs.</li>
 * </ul>
 *
 * <p>The token list is only read, never modified, so fixed-size and
 * unmodifiable lists are accepted.
 *
 * @param tokens the tokens of one sentence, in order; must not be {@code null}
 * @return the rebuilt sentence, or an empty string if {@code tokens} is empty
 */
public String detokenize(List<String> tokens) {
    // Tokens that attach to whatever precedes them.
    List<String> noSpaceBefore = Arrays.asList(",", ".", ";", ":", "?", "!", ")", "}", "]");
    // Tokens that attach to whatever follows them. The empty string doubles as
    // the "start of sentence" marker, so the first token gets no leading space.
    List<String> noSpaceAfter = Arrays.asList("(", "[", "{", "");

    StringBuilder sentence = new StringBuilder();
    String previous = "";
    boolean insideQuotation = false;
    boolean previousIsOpeningQuote = false;

    // Iterate instead of calling get(i): indexed access is O(n) on a LinkedList.
    for (String token : tokens) {
        boolean isQuote = "\"".equals(token);
        // A quote seen while a quotation is open closes it.
        boolean isClosingQuote = isQuote && insideQuotation;

        boolean glueToPrevious = noSpaceBefore.contains(token)
                || isClosingQuote
                || previousIsOpeningQuote
                || noSpaceAfter.contains(previous);
        if (!glueToPrevious) {
            sentence.append(' ');
        }
        sentence.append(token);

        // Update the quote state immediately, so that a closing quote directly
        // followed by an opening quote is still separated by a space.
        if (isQuote) {
            insideQuotation = !insideQuotation;
        }
        previousIsOpeningQuote = isQuote && insideQuotation;
        previous = token;
    }
    return sentence.toString();
}
}
And the test case...
import static org.junit.Assert.*;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import java.util.LinkedList;
public class DetokenizerTest {
/**
 * Checks that a token list containing commas, parentheses, a colon and a
 * quoted sentence is rebuilt into the expected text.
 */
@Test
public void test() {
    // Arrays.asList returns a fixed-size list; wrap it in a LinkedList so the
    // detokenizer receives a fully mutable, properly typed List<String>.
    List<String> tokens = new LinkedList<String>(Arrays.asList(
            "A", "test", ",", "(", "string", ")", ".",
            "Hello", "this", "is", "a", "2nd", "sentence", ".",
            "Here", "is", "a", "quote", ":",
            "\"", "This", "is", "the", "quote", ".", "\"",
            "Sentence", "4", "."));
    String expected = "A test, (string). Hello this is a 2nd sentence. Here is a quote: \"This is the quote.\" Sentence 4.";
    String actual = new Detokenizer().detokenize(tokens);
    assertEquals(expected, actual);
    System.out.println(actual);
}
}
Code Snippets
import java.util.Arrays;
import java.util.List;
import java.util.LinkedList;
public class Detokenizer {
/**
 * Rebuilds a sentence from its tokens.
 *
 * <p>Tokens are joined with a single space, with these exceptions:
 * <ul>
 *   <li>no space is written before closing punctuation
 *       (comma, period, semicolon, colon, question mark, exclamation mark,
 *       closing parenthesis, closing brace, closing bracket);</li>
 *   <li>no space is written after an opening parenthesis, bracket or brace,
 *       after an empty token, or before the very first token;</li>
 *   <li>double quotes alternate between opening and closing: an opening
 *       quote is glued to the token after it, a closing quote to the token
 *       before it. Quotes are assumed to come in matching pairs.</li>
 * </ul>
 *
 * <p>The token list is only read, never modified, so fixed-size and
 * unmodifiable lists are accepted.
 *
 * @param tokens the tokens of one sentence, in order; must not be {@code null}
 * @return the rebuilt sentence, or an empty string if {@code tokens} is empty
 */
public String detokenize(List<String> tokens) {
    // Tokens that attach to whatever precedes them.
    List<String> noSpaceBefore = Arrays.asList(",", ".", ";", ":", "?", "!", ")", "}", "]");
    // Tokens that attach to whatever follows them. The empty string doubles as
    // the "start of sentence" marker, so the first token gets no leading space.
    List<String> noSpaceAfter = Arrays.asList("(", "[", "{", "");

    StringBuilder sentence = new StringBuilder();
    String previous = "";
    boolean insideQuotation = false;
    boolean previousIsOpeningQuote = false;

    // Iterate instead of calling get(i): indexed access is O(n) on a LinkedList.
    for (String token : tokens) {
        boolean isQuote = "\"".equals(token);
        // A quote seen while a quotation is open closes it.
        boolean isClosingQuote = isQuote && insideQuotation;

        boolean glueToPrevious = noSpaceBefore.contains(token)
                || isClosingQuote
                || previousIsOpeningQuote
                || noSpaceAfter.contains(previous);
        if (!glueToPrevious) {
            sentence.append(' ');
        }
        sentence.append(token);

        // Update the quote state immediately, so that a closing quote directly
        // followed by an opening quote is still separated by a space.
        if (isQuote) {
            insideQuotation = !insideQuotation;
        }
        previousIsOpeningQuote = isQuote && insideQuotation;
        previous = token;
    }
    return sentence.toString();
}
}import static org.junit.Assert.*;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import java.util.LinkedList;
public class DetokenizerTest {
/**
 * Feeds the detokenizer a sample token sequence (commas, parentheses, a
 * colon and a quoted sentence) and compares the result with the known
 * original text; the rebuilt sentence is also printed to standard output.
 */
@Test
public void test() {
    String[] words = {
        "A", "test", ",", "(", "string", ")", ".",
        "Hello", "this", "is", "a", "2nd", "sentence", ".",
        "Here", "is", "a", "quote", ":",
        "\"", "This", "is", "the", "quote", ".", "\"",
        "Sentence", "4", "."
    };
    List<String> input = new LinkedList<String>(Arrays.asList(words));

    String wanted = "A test, (string). Hello this is a 2nd sentence. Here is a quote: \"This is the quote.\" Sentence 4.";
    Detokenizer detokenizer = new Detokenizer();
    String rebuilt = detokenizer.detokenize(input);

    assertEquals(wanted, rebuilt);
    System.out.println(rebuilt);
}
}
Context
StackExchange Code Review Q#11116, answer score: 4
Revisions (0)
No revisions yet.