Monday, March 9, 2009

LLSD scanning with ragel-made parser.

I wrote about Ragel a while ago.

This evening (literally in about 4 hours), I tried it out and cooked up a semi-functional parser for the LLSD XML serialization. Yes, the code below was cranked out from scratch in just a few hours :)

What's nice is that it can parse the data in chunks of practically any size, so it could be used to grab the data over a slow TCP connection. And the base64 decoding is included! Things that would still need to be done to make it usable include proper error handling and a better approach to handling possibly large data - right now it is kind of dumb...

Here's the result of running it over the sample XML from the draft:


$ ./a.out `cat ../sample.llsd`
Array start
Integer: '42'
UUID: '6bad258e06f04a87a659493117c9c162'
Map start
String: 'cold'
Map member with key 'hot':'cold'
Undef
Map member with key 'higgs_boson_rest_mass':''
URI: 'https://example.org/r/6bad258e-06f0-4a87-a659-493117c9c162'
Map member with key 'info_page':'https://example.org/r/6bad258e-06f0-4a87-a659-493117c9c162'
Date: '2008-10-13T19:00.00Z'
Map member with key 'status_report_due_by':'2008-10-13T19:00.00Z'
Map end
Array end
result = 1


And here is the .rl code, in case you're curious (you may need to "view source" if it gets garbled):


#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * State of one incremental LLSD XML scan.  Everything the machine needs to
 * resume on the next chunk of input is kept here.
 */
typedef struct llsd_scanner {
    /* Text of the value currently being collected. */
    char *accum;            /* buffer */
    int accum_size;         /* capacity of the buffer in bytes */
    int accum_index;        /* position of the next character to store */

    /* Text of the map key currently being collected. */
    char *key_accum;
    int key_accum_size;
    int key_accum_index;

    /* Bytes decoded from a base64 payload. */
    unsigned char *bin_accum;
    int bin_accum_size;
    int bin_accum_index;
    /* The four translated base64 characters of the group being decoded. */
    unsigned char b64chars[4];

    /* Variables owned by the Ragel-generated code. */
    int *stack;             /* return stack of the machine */
    int stack_size;         /* number of entries allocated for the stack */
    int top;                /* current stack depth */
    int act;                /* NOTE(review): looks unused by the visible spec - confirm */
    int cs;                 /* current machine state */
    unsigned char *ts;      /* NOTE(review): looks unused by the visible spec - confirm */
    unsigned char *te;      /* NOTE(review): looks unused by the visible spec - confirm */
} llsd_scanner_t;



/*
 * Base64 translation table, indexed by (character - 43).  It covers the
 * character codes 43 ('+') through 122 ('z'); '$' marks a character that is
 * not part of the base64 alphabet.
 */
static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq";

/*
 * Translate the base64 character v and store the result in slot `index`
 * of scanner->b64chars.
 *
 * Characters outside the table range, and characters the table marks with
 * '$' (this includes the '=' padding character), are stored as 0.
 * `index` is not range-checked; the callers pass 0..3.
 */
static void llsd_scanner_putb64char(llsd_scanner_t *scanner, int index, unsigned char v)
{
    unsigned char code = 0;

    /* Look the character up only when it falls inside the table. */
    if (v >= 43 && v <= 122) {
        code = (unsigned char) cd64[v - 43];
    }

    if (code == '$') {
        code = 0;
    } else if (code != 0) {
        code = (unsigned char) (code - 61);
    }

    /* Table entries are biased by one so that 0 can mean "invalid". */
    scanner->b64chars[index] = (code != 0) ? (unsigned char) (code - 1) : 0;
}

/*
 * Pack four translated base64 values (in[0..3]) into three output bytes
 * (out[0..2]).
 */
static void b64decodeblock( unsigned char in[4], unsigned char out[3] )
{
    unsigned int upper;
    unsigned int lower;

    /* Inputs are read and outputs written in the same order as one
     * expression per byte would do it. */
    upper = (unsigned int) (in[0] << 2);
    lower = (unsigned int) (in[1] >> 4);
    out[0] = (unsigned char) (upper | lower);

    upper = (unsigned int) (in[1] << 4);
    lower = (unsigned int) (in[2] >> 2);
    out[1] = (unsigned char) (upper | lower);

    upper = (unsigned int) ((in[2] << 6) & 0xc0);
    lower = in[3];
    out[2] = (unsigned char) (upper | lower);
}


/*
 * Decode the four values waiting in scanner->b64chars and append the three
 * resulting bytes to the binary accumulator.
 *
 * Returns -1 when the bytes were stored and 0 when the accumulator is too
 * full (a message is printed in that case and nothing is stored).
 */
static int append_binchars(llsd_scanner_t *scanner)
{
    /* Guard first: refuse the group when there is not enough room left. */
    if (scanner->bin_accum_index + 4 >= scanner->bin_accum_size) {
        printf("Bin accumulator overflow!");
        return 0;
    }

    b64decodeblock(scanner->b64chars, &scanner->bin_accum[scanner->bin_accum_index]);
    scanner->bin_accum_index += 3;
    return -1;
}

/*
 * Empty an accumulator by rewinding its write index to zero.  The buffer
 * contents themselves are left untouched.
 */
static void clear_accum(char *accum, int *accum_index)
{
    (void) accum;
    accum_index[0] = 0;
}

/*
 * Append character c to an accumulator.
 *
 * One byte of the buffer is always kept free, so a character is accepted
 * only while *accum_index is below accum_size - 1.
 *
 * Returns -1 when the character was stored and 0 when the buffer is full
 * (the character is dropped and the index is left unchanged).
 */
static int append_accum(char *accum, int accum_size, int *accum_index, unsigned char c)
{
    const int pos = *accum_index;

    if (pos >= accum_size - 1) {
        return 0;
    }

    accum[pos] = c;
    *accum_index = pos + 1;
    return -1;
}

/*
 * Turn the accumulated characters into a C string by writing a NUL byte at
 * the current write index.  The index itself is not changed.
 */
static void terminate_accum(char *accum, int *accum_index)
{
    char *end = accum + *accum_index;

    *end = '\0';
}




%%{

# Ragel state machine that scans the LLSD XML serialization.
machine LLSDScanner;

# The machine variables are members of the scanner struct, so the generated
# code has to reach them through this prefix.
access scanner->;


# ---- actions on the value accumulator (scanner->accum) ----

action terminate_accum {
    terminate_accum(scanner->accum, &scanner->accum_index);
    printf("Accumulated: '%s'\n", scanner->accum);
}
# Rewinds both the text accumulator and the decoded-binary accumulator.
action clear_accum {
    clear_accum(scanner->accum, &scanner->accum_index);
    // bin_accum is unsigned char * while the helper takes char *, so cast explicitly
    clear_accum((char *) scanner->bin_accum, &scanner->bin_accum_index);
}
# Stores the current input character; a full buffer drops it silently.
action append_accum {
    append_accum(scanner->accum, scanner->accum_size, &(scanner->accum_index), fc);
}

# ---- actions on the map key accumulator (scanner->key_accum) ----

# NUL-terminates the collected key so that it can be printed later.
action print_key_accum {
    terminate_accum(scanner->key_accum, &scanner->key_accum_index);
    // printf("Key Accumulated: '%s'\n", scanner->key_accum);
}

action clear_key_accum {
    clear_accum(scanner->key_accum, &scanner->key_accum_index);
}
action append_key_accum {
    append_accum(scanner->key_accum, scanner->key_accum_size, &scanner->key_accum_index, fc);
}

# ---- per-type conversions ----
# Each one NUL-terminates the accumulated text and prints it; no real
# conversion into a typed value is done yet.

action convert_string {
    terminate_accum(scanner->accum, &scanner->accum_index);
    printf("String: '%s'\n", scanner->accum);
}
action convert_uuid {
    terminate_accum(scanner->accum, &scanner->accum_index);
    printf("UUID: '%s'\n", scanner->accum);
}
action convert_uri {
    terminate_accum(scanner->accum, &scanner->accum_index);
    printf("URI: '%s'\n", scanner->accum);
}
action convert_base64 {
    // same signedness cast as in clear_accum
    terminate_accum((char *) scanner->bin_accum, &(scanner->bin_accum_index));
    // the decoded bytes are printed as a C string, so output stops at the first zero byte
    printf("Binary Accumulated: '%s'\n", scanner->bin_accum);
}
action convert_date {
    terminate_accum(scanner->accum, &(scanner->accum_index));
    printf("Date: '%s'\n", scanner->accum);
}
action convert_integer {
    terminate_accum(scanner->accum, &(scanner->accum_index));
    printf("Integer: '%s'\n", scanner->accum);
}
action convert_real {
    terminate_accum(scanner->accum, &(scanner->accum_index));
    printf("Real: '%s'\n", scanner->accum);
}
# An undef element carries no text, so the accumulator is left holding an
# empty string.
action convert_undef {
    terminate_accum(scanner->accum, &(scanner->accum_index));
    strcpy(scanner->accum, "");
    printf("Undef\n");
}

# Prints the key collected for a map member next to the text of its value.
action assign_map_member {
    printf("Map member with key '%s':'%s'\n", scanner->key_accum, scanner->accum);
}

# Single-character building blocks.  Each one copies the character it
# matches into the value accumulator.
one_or_two = /[12]/ @append_accum;
zero_or_one = /[01]/ @append_accum;
zero_to_nine = /[0-9]/ @append_accum;
zero_to_two = /[0-2]/ @append_accum;
zero_to_three = /[0-3]/ @append_accum;
zero_to_five = /[0-5]/ @append_accum;
dot = '.' @append_accum;
dash = '-' @append_accum;
colon = ':' @append_accum;
dot_or_colon = /[.:]/ @append_accum;
letter_T = 'T' @append_accum;
letter_Z = 'Z' @append_accum;

# Date and time pieces (the names look like the RFC 3339 grammar rules -
# presumably intentional, confirm).  Only the first digit of each field is
# range-limited, so values such as month 19 are accepted.
date_fullyear = one_or_two . zero_to_nine . zero_to_nine . zero_to_nine;
date_month = zero_or_one . zero_to_nine;
date_mday = zero_to_three . zero_to_nine;
time_hour = zero_to_two . zero_to_nine;
time_minute = zero_to_five . zero_to_nine;
time_sec = zero_to_five . zero_to_nine;
# Exactly one fractional digit is accepted.
time_secfrac = dot . zero_to_nine;
full_date = date_fullyear . dash . date_month . dash . date_mday;
# Two accepted shapes: HH:MM:SS.f, or HH:MM followed by a dot or colon and
# SS with no fraction (this second shape is what lets 19:00.00 through).
partial_time = ( time_hour . colon . time_minute . colon . time_sec . time_secfrac | time_hour . colon . time_minute . dot_or_colon . time_sec );
# Only the literal Z suffix is accepted after the time.
date_time = full_date . letter_T . partial_time . letter_Z;

# Optional run of whitespace; nothing is accumulated for it.
maybe_space = space*;

# Element text: everything up to the next opening angle bracket goes into
# the value accumulator.
some_string_char = /[^<]/ @append_accum;
some_string = some_string_char*;

# Same as above, but collected into the key accumulator.
some_key_string_char = /[^<]/ @append_key_accum;
some_key_string = some_key_string_char*;

# base64 parsing. Base64 characters come in series of 4

# The padding character = is part of the accepted set.
base64_char = [ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+=/];
# One action per position inside a group of four: each stores the
# translated character in the matching slot of scanner->b64chars.
action base64_one { llsd_scanner_putb64char(scanner, 0, fc); }
action base64_two { llsd_scanner_putb64char(scanner, 1, fc); }
action base64_three { llsd_scanner_putb64char(scanner, 2, fc); }
action base64_four { llsd_scanner_putb64char(scanner, 3, fc); }
# Decodes the completed group into the binary accumulator; the return value
# of append_binchars is ignored here.
action base64_four_chars { append_binchars(scanner); }

base64_first_char = base64_char @base64_one;
base64_second_char = base64_char @base64_two;
base64_third_char = base64_char @base64_three;
base64_fourth_char = base64_char @base64_four;

# Whitespace is allowed before each of the four characters of a group.
base64_four_chars = maybe_space . base64_first_char . maybe_space . base64_second_char . maybe_space .
base64_third_char . maybe_space . base64_fourth_char @base64_four_chars;

# A payload is one or more complete groups.
base64_string = base64_four_chars+;
# Opening angle bracket of any tag, optionally preceded by whitespace.
tag_start = maybe_space . '<';
# First two characters of a closing tag.  The no-space form is for places
# where whitespace in front of the closing tag belongs to the element text.
tag_end_nospace = '</';
tag_end = maybe_space . tag_end_nospace;

# Tag names.  Each pattern matches what follows the opening bracket (or the
# closing-tag prefix), is case-insensitive, and includes the closing angle
# bracket.
llsd_tag = /llsd>/i;

# undef is only recognised in its self-closing form.
undef_tag = /undef\/>/i;
boolean_tag = /boolean>/i;
integer_tag = /integer>/i;
real_tag = /real>/i;
uuid_tag = /uuid>/i;
string_tag = /string>/i;
date_tag = /date>/i;
uri_tag = /uri>/i;
# The encoding attribute is optional, and base64 is the only value accepted.
binary_tag = /binary/i . (space . /encoding="base64"/i)? . '>';

key_tag = /key>/i;
map_tag = /map>/i;
array_tag = /array>/i;

# Entering a container: announce it, then call into the machine that scans
# the container contents (the return state goes onto the scanner stack).
action call_array {
printf("Array start\n");
fcall array_element;
}
action call_map {
printf("Map start\n");
fcall map_element;
}
# Leaving a container: announce it, then return to the state that made the
# call.
action return_from_map {
printf("Map end\n");
fret;
}
action return_from_array {
printf("Array end\n");
fret;
}

# Optional sign in front of a number or an exponent.
sign_char = ( '+' | '-' );

# Real: optional sign, digits, optional fraction, optional exponent.  Every
# matched character is copied into the value accumulator.  An empty value is
# still accepted, as before.
# NOTE(review): spellings such as NaN or Infinity are not handled - check
# the LLSD draft if they are needed.
real_number = ( sign_char? . digit* . ( '.' . digit* )? . ( [eE] . sign_char? . digit+ )? ) $append_accum;
# Integer: optional sign followed by digits.
integer_number = ( sign_char? . digit* ) $append_accum;

# UUID: 32 hex digits in 8-4-4-4-12 groups.  The dashes between the groups
# are optional and are not copied into the accumulator.
hex_char = /[0-9a-fA-F]/ @append_accum;
uuid_val = hex_char{8,8} . '-'? . hex_char{4,4} . '-'? . hex_char{4,4} . '-'? . hex_char{4,4} . '-'? . hex_char{12,12};


# NOTE(review): nothing in the visible spec refers to this action.
action error { printf("Error!\n"); }
# One LLSD value of any type.  For the scalar types the accumulators are
# reset when the opening tag is complete, the content is collected, and the
# matching convert action runs on the last character of the closing tag.
# NOTE(review): the boolean branch collects nothing and has no convert
# action, so a boolean produces no output of its own.
# The string branch uses tag_end_nospace so that trailing whitespace stays
# part of the string.
# For array and map only the opening tag is matched here; the rest of the
# container, including its closing tag, is scanned by the called machine.
some_element = tag_start . (undef_tag @convert_undef
| boolean_tag @clear_accum . ('true'|'false') . tag_end . boolean_tag
| integer_tag @clear_accum . integer_number . tag_end . integer_tag @convert_integer
| real_tag @clear_accum . real_number . tag_end . real_tag @convert_real
| uuid_tag @clear_accum . uuid_val . tag_end . uuid_tag @convert_uuid
| string_tag @clear_accum . some_string . tag_end_nospace . string_tag @convert_string
| date_tag @clear_accum . date_time . tag_end . date_tag @convert_date
| uri_tag @clear_accum . some_string . tag_end . uri_tag @convert_uri
| binary_tag @clear_accum . base64_string . tag_end . binary_tag @convert_base64
| array_tag @call_array
| map_tag @call_map
);



# Contents of an array: any number of values, then the closing array tag,
# which returns to the caller.
array_element := (some_element)* . tag_end . array_tag @return_from_array;

# Opening key tag; a new key starts with an empty key accumulator.
map_key_start = tag_start . key_tag @clear_key_accum;

# Closing key tag.  No whitespace is allowed in front of it because
# everything up to the opening bracket belongs to the key text.  The
# collected key is NUL-terminated here so that assign_map_member can print it.
map_key_end = '</' . key_tag @print_key_accum;
# One map member: a key element followed by the value element.
map_member = map_key_start . some_key_string . map_key_end . some_element @assign_map_member;

# Contents of a map: any number of members, then the closing map tag, which
# returns to the caller.
map_element := (map_member)* . tag_end . map_tag @return_from_map;

# The llsd element holds at most one value.
body = (some_element)?;

# Rest of an XML declaration after the opening bracket and question mark:
# the letters xml, then anything up to and including the closing bracket.
xml_preamble = /xml[^>]*/i . '>';

# Whole document: optional XML declaration, then the llsd element.
llsd_body = tag_start . ( '?' . xml_preamble . tag_start )? . llsd_tag . body . tag_end . llsd_tag;

# res is the local variable of the function that contains the write exec
# statement; it is set to 1 once the closing llsd tag has been consumed.
main := llsd_body @{ res = 1; };

}%%

%% write data;


/*
 * Release a scanner created by llsd_scanner_alloc() together with all of
 * its buffers.
 *
 * scanner may be NULL, in which case nothing happens.  A partially
 * constructed scanner (some buffers still NULL) is fine as well, because
 * free(NULL) is a no-op.
 */
void llsd_scanner_free(llsd_scanner_t *scanner)
{
    if (scanner == NULL) {
        return;
    }

    free(scanner->stack);
    free(scanner->key_accum);
    free(scanner->accum);
    free(scanner->bin_accum);
    free(scanner);
}

/*
 * Allocate and initialise a scanner.
 *
 * key_accum_size  capacity in bytes of the map key accumulator
 * accum_size      capacity in bytes of the value accumulator
 * bin_accum_size  capacity in bytes of the decoded-binary accumulator
 * stack_size      number of entries in the machine return stack
 *
 * All four sizes must be positive: the accumulators are NUL-terminated in
 * place, so each needs room for at least the terminator.
 *
 * Returns the new scanner, or NULL when a size is invalid or memory runs
 * out.  The caller owns the result and releases it with
 * llsd_scanner_free().
 */
llsd_scanner_t *llsd_scanner_alloc(int key_accum_size, int accum_size, int bin_accum_size, int stack_size)
{
    llsd_scanner_t *scanner;

    if (key_accum_size <= 0 || accum_size <= 0 || bin_accum_size <= 0 || stack_size <= 0) {
        return NULL;
    }

    scanner = calloc(1, sizeof *scanner);
    if (scanner == NULL) {
        return NULL;
    }

    /* Zero-filled buffers: an accumulator that is printed before anything
     * was stored in it then reads as an empty string. */
    scanner->bin_accum = calloc((size_t) bin_accum_size, 1);
    scanner->accum = calloc((size_t) accum_size, 1);
    scanner->key_accum = calloc((size_t) key_accum_size, 1);
    scanner->stack = calloc((size_t) stack_size, sizeof *scanner->stack);

    if (scanner->bin_accum == NULL || scanner->accum == NULL ||
        scanner->key_accum == NULL || scanner->stack == NULL) {
        /* Members that were never allocated are still NULL from calloc. */
        llsd_scanner_free(scanner);
        return NULL;
    }

    scanner->bin_accum_size = bin_accum_size;
    scanner->accum_size = accum_size;
    scanner->key_accum_size = key_accum_size;
    scanner->stack_size = stack_size;

    /* Ragel expands this into the initialisation of the machine state. */
    %% write init;

    return scanner;
}

// Feed one NUL-terminated chunk of input to the scanner.  The machine state
// is kept in *scanner between calls, so a document may be split over several
// chunks.
//
// Returns 1 when the action attached to the main machine has run during this
// chunk, -1 when that did not happen and the machine state is non-zero, and
// 0 otherwise.  The caller in this file treats 0 as an error.
// NOTE(review): state 0 is presumably the error state of the generated
// machine - confirm against the generated code.
int llsd_scanner_run(llsd_scanner_t *scanner, char *chunk)
{
// p and pe (start and end of the data) and res are used by name from the
// code that Ragel generates for the write exec statement below.
char *p = chunk;
char *pe = chunk + strlen(chunk);
int res = 0;
// printf("Running parser on: %s, curr state: %i\n", chunk, scanner->cs);
%% write exec;
// Machine not finished during this chunk, but state is non-zero.
if (scanner->cs && res == 0) {
res = -1;
}
return res;
}


/*
 * Demo driver: every command line argument is fed to the scanner as one
 * chunk of the document, in order.  Scanning stops at the first chunk for
 * which the scanner reports an error.  The result of the last scanned chunk
 * is printed at the end.
 */
int main( int argc, char **argv )
{
    int res = 0;

    if (argc > 1) {
        llsd_scanner_t *scan = llsd_scanner_alloc(1000, 1000, 32768, 100);

        /* Without this check a failed allocation would be passed on to the
         * scanner and dereferenced there. */
        if (scan == NULL) {
            fprintf(stderr, "Could not allocate the LLSD scanner\n");
            return EXIT_FAILURE;
        }

        for (int i = 1; i < argc; i++) {
            res = llsd_scanner_run(scan, argv[i]);
            if (res == 0) {
                printf("Error on chunk %d\n", i);
                break;
            }
        }

        llsd_scanner_free(scan);
    }

    printf("result = %i\n", res);
    return 0;
}


No comments: