patterncppMinor

Serialization: Escape Input String

Submitted by: @import:stackexchange-codereview·Mar 10, 2026·

Viewed 0 times

inputserializationstringescape

Problem

In JSON strings, characters can be escaped with a backslash (`\`).
Here is an iterator that can read such strings and convert the escaped characters to UTF-8.

```
#ifndef THORSANVIL_SERIALIZATION_UNICODE_ITERATOR_H
#define THORSANVIL_SERIALIZATION_UNICODE_ITERATOR_H

#include

namespace ThorsAnvil
{
namespace Serialization
{

namespace
{

long convertHexToDec(char x)
{
if (x >= '0' && x = 'A' && x = 'a' && x
struct UnicodePushBackIterator: std::iterator
{
C& cont;
bool lastWasSlash;
int unicodeCount;
uint32_t unicodeValue;
UnicodePushBackIterator(C& c)
: cont(c)
, lastWasSlash(false)
, unicodeCount(0)
{}
UnicodePushBackIterator& operator++() {return *this;}
UnicodePushBackIterator& operator() {return this;}
void operator=(char x)
{
if (unicodeCount)
{
if (unicodeCount == 6)
{
if (x != '\\')
{
throw std::runtime_error("ThorsAnvil::Serialization::UnicodeIterator: Push->Surrogate pair(No Slash): \\uD8xx Must be followed by \\uDCxx");
}
--unicodeCount;
}
else if (unicodeCount == 5)
{
if (x != 'u')
{
throw std::runtime_error("ThorsAnvil::Serialization::UnicodeIterator: Push->Surrogate pair(No u): \\uD8xx Must be followed by \\uDCxx");
}
--unicodeCount;
}
else
{
unicodeValue > 6)));
cont.push_back(0x80 |((unicodeValue >> 0) & 0x3F));
}
else if (unicodeValue > 12)));
cont.push_back(0x80 |((unicodeValue >> 6) & 0x3F));
cont.push_back(0x80 |((unicodeValue >> 0) & 0x3F));
}
else
{

Solution

The newsletter suggested I try to answer this, so here is my attempt:

There is a lot of repeated code here. I don't think it can be shortened much, but I would move the conversion done by this switch into its own function:

// Dispatch on the escape letter x: every case except 'u' appends exactly one
// character to cont. There is no default, so any other value of x appends nothing.
switch(x)
{
    case '"':   cont.push_back('"');    break;
    case '\\':  cont.push_back('\\');   break;
    case '/':   cont.push_back('/');    break;
    case 'b':   cont.push_back('\b');   break;
    case 'f':   cont.push_back('\f');   break;
    case 'n':   cont.push_back('\n');   break;
    case 'r':   cont.push_back('\r');   break;
    case 't':   cont.push_back('\t');   break;
    // 'u' appends nothing here; it sets unicodeCount to 4 and clears unicodeValue.
    case 'u':   unicodeCount = 4; unicodeValue = 0; break;
}

More like this:

/// Maps the character that follows a backslash in a JSON string to the
/// character it denotes.
///
/// @param c  The escape letter, i.e. the character right after the backslash.
/// @return   `c` itself for `"`, `\` and `/`; the matching control character
///           for `b`, `f`, `n`, `r` and `t`; any other character unchanged.
///
/// `u` is deliberately not translated here: it introduces a multi-character
/// sequence and has to be handled by the caller before calling this function.
char convertChar (char c) {
    switch(c)
    {
        // These three stand for themselves once the backslash is dropped.
        case '"':
        case '\\':
        case '/':
            return c;

        case 'b':   return '\b';
        case 'f':   return '\f';
        case 'n':   return '\n';
        case 'r':   return '\r';
        case 't':   return '\t';

        // Always return a value: flowing off the end of a non-void function is
        // undefined behaviour. Unrecognised letters are passed through as-is.
        default:
            return c;
    }
}

Then, you can change the switch statement like this:

// 'u' produces no output by itself: set unicodeCount to 4 and clear
// unicodeValue instead of appending anything to cont.
if (x == 'u')
{
    unicodeCount = 4;
    unicodeValue = 0;
}
else
{
    // Every other escape letter yields exactly one character.
    cont.push_back(convertChar(x));
}

Further down, you could reuse that again:

// Same translation as above, but the result is stored in cont[0] rather than appended.
switch(next)
{
    // All single-character escapes share one branch and go through convertChar.
    case '"':
    case '\\':
    case '/':
    case 'b':
    case 'f':
    case 'n':
    case 'r':
    case 't':
        cont[0] = convertChar(next); break;

    // 'u' is delegated to decodeUnicode() (defined outside this snippet).
    case 'u':   decodeUnicode();  break;
    // Any other character is stored unchanged.
    default:    cont[0] = next;   break;
}

Otherwise, it looks good to me.

Context

StackExchange Code Review Q#79281, answer score: 3

Revisions (0)

No revisions yet.