diff options
Diffstat (limited to 'cscmd/lex.c')
-rw-r--r-- | cscmd/lex.c | 815 |
1 files changed, 815 insertions, 0 deletions
diff --git a/cscmd/lex.c b/cscmd/lex.c new file mode 100644 index 0000000..318e074 --- /dev/null +++ b/cscmd/lex.c @@ -0,0 +1,815 @@ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <stdarg.h> +#include <limits.h> +#include <locale.h> +#include <wchar.h> +#include <wctype.h> + +#define PCRE2_CODE_UNIT_WIDTH 32 +#include <pcre2.h> + +#include <defs.h> + +#include <main.h> +#include <error.h> +#include <msglog.h> +#include <xalloc.h> +#include <utf8ing.h> +#include <symtab.h> +#include <parse.h> + +#include <lex.h> + + + +int lineno = 0; +int colno = 0; + +static int maxtoken; +static wchar_t *token_buffer; + +static int max8token; +static utf8_t *token_utf8_buffer; + +int indent_level = 0; /* Number of '{' minus number of '}'. */ + +static int end_of_file = 0; +static int nextchar = -1; + +static char *locale; + +#define GETC(c) ({ wint_t ret; ++colno; ret = fgetwc( config ); ret; }) +#define UNGETC(c) ({ wint_t ret; --colno; ret = ungetwc( c, config ); ret; }) + + +static wchar_t *extend_token_buffer( wchar_t *p ) +{ + int offset = p - token_buffer; + maxtoken = maxtoken * 2 + 10; + token_buffer = (wchar_t *)xrealloc( token_buffer, (maxtoken + 2)*sizeof(wchar_t) ); + + return( token_buffer + offset ); +} + +static utf8_t *extend_token_utf8_buffer( utf8_t *p ) +{ + int offset = p - token_utf8_buffer; + max8token = max8token * 2 + 10; + token_utf8_buffer = (utf8_t *)xrealloc( token_utf8_buffer, (max8token + 2)*6 ); + + return( token_utf8_buffer + offset ); +} + + +void yyerror( char const *s ) +{ + error( "%s", s ); +} + + +void init_lex( void ) +{ + locale = setlocale( LC_ALL, "en_US.utf8" ); + + lineno = 0; + colno = 0; + + nextchar = -1; + maxtoken = 40; + max8token = 40; + + indent_level = 0; + end_of_file = 0; + + token_buffer = (wchar_t *)xmalloc( maxtoken * sizeof(wchar_t) + 2 ); + token_utf8_buffer = (utf8_t *)xmalloc( max8token * 6 + 2 ); +} + +void fini_lex( void ) +{ + locale = setlocale( LC_ALL, locale ); + + if( token_buffer ) { free( token_buffer ); token_buffer = NULL; } + if( token_utf8_buffer ) { free( token_utf8_buffer ); token_utf8_buffer = NULL; } + + indent_level = 0; + end_of_file = 0; + + max8token = 0; + maxtoken = 0; + nextchar = -1; + + lineno = 0; + colno = 0; +} + +static wint_t check_newline( void ) +{ + wint_t c; + + ++lineno; + colno = 0; /* считает GETC()/UNGETC(); здесь надо только обнулить */ + + /***************************************** + Read first nonwhite char on the line. + *****************************************/ + c = GETC(); + while( c == ' ' || c == '\t' ) c = GETC(); + + if( c == '#' ) goto skipline; + else return( c ); + + /* skip the rest of this line */ +skipline: + + while( c != '\n' && c != WEOF ) + c = GETC(); + + return( c ); +} + +static wint_t skip_comment( int c ) +{ + if( c == '*' ) + { +do1: + do + { + c = GETC(); + if( c == '\n' ) { ++lineno; colno = 0; } + + } while( c != '*' && c != WEOF ); + + if( c == WEOF ) + { + unterminated_comment(); + return( WEOF ); + } + + c = GETC(); + + if( c == '/' ) + { + c = GETC(); + if( c == '\n' ) c = check_newline(); + return( c ); + } + else + { + UNGETC( c ); + goto do1; + } + } + else if( c == '/' || c == '#' ) + { + do + { + c = GETC(); + + } while( c != '\n' && c != WEOF ); + + if( c == WEOF ) + { + unterminated_comment(); + return( WEOF ); + } + else c = check_newline(); + + return( c ); + } + + return( c ); + +} /* End skip_commemnt() */ + +static wint_t skip_white_space( wint_t c ) +{ + for( ;; ) + { + switch( c ) + { + case '\n': + c = check_newline(); + break; + + case '#': + c = skip_comment( c ); + return( skip_white_space( c ) ); + break; + + case '/': + c = GETC(); + if( c == '/' || c == '*' ) + { + c = skip_comment( c ); + return( skip_white_space( c ) ); + } + else + { + UNGETC( c ); + return( '/' ); + } + break; + + case ' ': + case '\t': + case '\f': + case '\v': + case '\b': + case '\r': + c = GETC(); + break; + case '\\': + c = GETC(); + if( c == '\n' ) { ++lineno; colno = 0; } + else + { + warning( "%s", "Stray '\\' in program" ); + } + c = GETC(); + break; + default: + return( c ); + + } /* End switch( c ) */ + + } /* End for( ;; ) */ + +} /* End skip_white_space() */ + +static wint_t readescape( int *ignore_ptr ) +/* + read escape sequence, returning a char, or store 1 in *ignore_ptr + if it is backslash-newline + */ +{ + wint_t c = GETC(); + wint_t code; + unsigned count; + unsigned firstdig = 0; + int nonull; + + switch( c ) + { + case 'x': + code = 0; + count = 0; + nonull = 0; + while( 1 ) + { + c = GETC(); + if( !(c >= 'a' && c <= 'f') && + !(c >= 'A' && c <= 'F') && + !(c >= '0' && c <= '9') ) + { + UNGETC( c ); + break; + } + code *= 16; + if( c >= 'a' && c <= 'f' ) code += c - 'a' + 10; + if( c >= 'A' && c <= 'F' ) code += c - 'A' + 10; + if( c >= '0' && c <= '9' ) code += c - '0'; + if( code != 0 || count != 0 ) + { + if( count == 0 ) firstdig = code; + count++; + } + nonull = 1; + + } /* End while( 1 ) */ + + if( !nonull ) + { + error( "%s", "\\x used with no following hex digits" ); + } + else if( count == 0 ) + /* Digits are all 0's. Ok. */ + ; + else if( (count - 1) * 4 >= 32 || /* 32 == bits per INT */ + (count > 1 && ((1 << (32 - (count-1) * 4)) <= firstdig ))) + { + warning( "%s", "Hex escape out of range" ); + } + return( code ); + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': + code = 0; + count = 0; + while( (c <= '7') && (c >= '0') && (count++ < 6) ) + { + code = (code * 8) + (c - '0'); + c = GETC(); + } + UNGETC( c ); + return( code ); + + case '\\': case '\'': case '"': + return( c ); + + case '\n': + lineno++; colno = 0; + *ignore_ptr = 1; + return( 0 ); + + case 'n': + return( '\n' ); + + case 't': + return( '\t' ); + + case 'r': + return( '\r' ); + + case 'f': + return( '\f' ); + + case 'b': + return( '\b' ); + + case 'a': + return( '\a' ); + + case 'v': + return( '\v' ); + } + + return( c ); + +} /* End of readescape() */ + + +int html_symbol_name( wchar_t *str ) +{ + int rc = 0, error = 0; + PCRE2_SIZE offset = 0; + wchar_t pattern[] = L"^(&[#A-Za-z0-9]*;)"; + + pcre2_match_data *match; + + pcre2_code *regexp = pcre2_compile( (PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0, &error, &offset, NULL ); + if( regexp == NULL ) + { + return 0; /* PCRE compilation failed */ + } + + match = pcre2_match_data_create_from_pattern( regexp, NULL ); + + rc = pcre2_match( regexp, (PCRE2_SPTR)str, (int)wcslen(str), 0, 0, match, NULL ); + if( rc < 0 ) + { + /* not match */ + pcre2_match_data_free( match ); + pcre2_code_free( regexp ); + return 0; + } + else + { + /* match */ + pcre2_match_data_free( match ); + pcre2_code_free( regexp ); + return 1; + } +} + + +int yylex( void ) +{ + wint_t c; + wchar_t *p; + int value; + + if( nextchar >= 0 ) + c = nextchar, nextchar = -1; + else + c = GETC(); + + while( 1 ) + { + switch( c ) + { + case ' ': + case '\t': + case '\f': + case '\v': + case '\b': + c = skip_white_space( c ); + break; + + case '\r': + case '\n': + case '/': + case '#': + case '\\': + c = skip_white_space( c ); + + default: + goto found_nonwhite; + + } /* End switch( c ) */ +found_nonwhite: + + token_buffer[0] = c; + token_buffer[1] = 0; + + switch( c ) + { + case WEOF: + end_of_file = 1; + token_buffer[0] = 0; + value = 0; + goto done; + break; + + case '$': /* dollar in identifier */ + if( 1 ) goto letter; + return '$'; + + case 'A': case 'B': case 'C': case 'D': case 'E': + case 'F': case 'G': case 'H': case 'I': case 'J': + case 'K': case 'L': case 'M': case 'N': case 'O': + case 'P': case 'Q': case 'R': case 'S': case 'T': + case 'U': case 'V': case 'W': case 'X': case 'Y': + case 'Z': + case 'a': case 'b': case 'c': case 'd': case 'e': + case 'f': case 'g': case 'h': case 'i': case 'j': + case 'k': case 'l': case 'm': case 'n': case 'o': + case 'p': case 'q': case 'r': case 's': case 't': + case 'u': case 'v': case 'w': case 'x': case 'y': + case 'z': + case '_': + + /* RUSSIAN */ + case L'А': case L'Б': case L'В': case L'Г': case L'Д': + case L'Е': case L'Ё': case L'Ж': case L'З': case L'И': + case L'Й': case L'К': case L'Л': case L'М': case L'Н': + case L'О': case L'П': case L'Р': case L'С': case L'Т': + case L'У': case L'Ф': case L'Х': case L'Ц': case L'Ч': + case L'Ш': case L'Щ': case L'Ъ': case L'Ы': case L'Ь': + case L'Э': case L'Ю': case L'Я': + + case L'а': case L'б': case L'в': case L'г': case L'д': + case L'е': case L'ё': case L'ж': case L'з': case L'и': + case L'й': case L'к': case L'л': case L'м': case L'н': + case L'о': case L'п': case L'р': case L'с': case L'т': + case L'у': case L'ф': case L'х': case L'ц': case L'ч': + case L'ш': case L'щ': case L'ъ': case L'ы': case L'ь': + case L'э': case L'ю': case L'я': + +letter: + p = token_buffer; + while( iswalnum( c ) || c == '_' || c == '$' || c == '@' || c == '-' || c == '.' || c == ':' ) + { + if( p >= token_buffer + maxtoken ) + { + p = extend_token_buffer( p ); + extend_token_utf8_buffer( token_utf8_buffer ); + } + + *p++ = c; + c = GETC(); + } + *p = 0; + nextchar = c; + value = VARIABLE; + + (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer ); + + /********************* + install into symtab + *********************/ + { + if( !strcmp( "section", (const char *)token_utf8_buffer ) ) + { + value = SECTION; + yylval.sym = install( NULL, SECTION, NULL ); + } + else if( !strcmp( "repo", (const char *)token_utf8_buffer ) ) + { + value = REPO; + yylval.sym = install( NULL, REPO, NULL ); + } + else + { + SYMBOL *sp = NULL; + + if( (sp = lookup( (const char *)token_utf8_buffer )) == (SYMBOL *)0 ) + sp = install( (const char *)token_utf8_buffer, VARIABLE, 0 ); + + /****************************************************************** + Если переменная уже в таблице, то мы предполагаем, что она имеет + тип равный одному из допустимых: NUMERICAL, STRING, или PATH. + ******************************************************************/ + if( sp->type != VARIABLE ) + { + switch( sp->type ) + { + case NUMERICAL: + case STRING: + case PATH: + value = sp->type; + break; + default: + /* error */ + break; + } + } + yylval.sym = sp; + } + } + + token_buffer[0] = 0; + token_utf8_buffer[0] = 0; + goto done; + break; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + { + int constant = 0; +/* integer: */ + p = token_buffer; + while( iswdigit( c ) ) + { + if( p >= token_buffer + maxtoken ) + { + p = extend_token_buffer( p ); + extend_token_utf8_buffer( token_utf8_buffer ); + } + + *p++ = c; + c = GETC(); + } + *p = 0; + nextchar = c; + value = NUMERICAL; + + (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer ); + + /********************* + install into symtab + *********************/ + { + (void)swscanf( (const wchar_t *)token_buffer, L"%d", &constant ); + yylval.sym = install( NULL, NUMERICAL, constant ); + } + + token_buffer[0] = 0; + token_utf8_buffer[0] = 0; + goto done; + break; + } + + case '\'': +/* path_constant: */ + { + int num_chars = 0; + unsigned int width = 8; /* to allow non asscii in path set width = 16 */ + + while( 1 ) + { +tryagain: + c = GETC(); + + if( c == '\'' || c == WEOF ) break; + if( c == '\\' ) + { + int ignore = 0; + c = readescape( &ignore ); + if( ignore ) goto tryagain; + if( (unsigned)c >= (1 << width) ) + { + warning( "%s", "Escape sequence out of range" ); + } + } + else if( c == '\n' ) { lineno++; colno = 0; } + + num_chars++; + if( num_chars > maxtoken - 4 ) + { + extend_token_buffer( token_buffer ); + extend_token_utf8_buffer( token_utf8_buffer ); + } + + token_buffer[num_chars] = c; + + } /* End while( 1 ) */ + + token_buffer[num_chars + 1] = '\''; + token_buffer[num_chars + 2] = 0; + + if( c != '\'' ) + { + error( "%s", "Malformated path constant" ); + } + else if( num_chars == 0 ) + { + error( "%s", "Empty path constant" ); + } + + /* build path: */ + { + wchar_t *s, *string = NULL; + wchar_t *p = &token_buffer[0]; + + while( *p ) + { + if( *p == '\n' || *p == '\t' ) *p = ' '; + ++p; + } + + string = (wchar_t *)malloc( maxtoken * 4 + 10 ); + + p = &token_buffer[1]; + s = &string[0]; + + while( *p == ' ' ) ++p; + + while( *p ) + { + if( *p != ' ' ) + *s++ = *p++; + else + ++p; + } + --s; *s = 0; + while( *(s-1) == ' ' ) --s; + *s = 0; + + (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)string ); + + free( string ); + } + + /********************* + install into symtab + *********************/ + { + yylval.sym = install( NULL, PATH, (char *)token_utf8_buffer ); + } + + token_buffer[0] = 0; + token_utf8_buffer[0] = 0; + value = PATH; + goto done; + } + + case '"': +/* string_constant: */ + { + c = GETC(); + p = token_buffer + 1; + + while( c != '"' && c >= 0 ) + { + if( c == '\\' ) + { + int ignore = 0; + c = readescape( &ignore ); + if( ignore ) goto skipnewline; + } + else if( c == '\n' ) lineno++; + + if( p == token_buffer + maxtoken ) + { + p = extend_token_buffer( p ); + extend_token_utf8_buffer( token_utf8_buffer ); + } + *p++ = c; + +skipnewline: + c = GETC(); + + } /* End while( " ) */ + + *p = 0; + + if( c < 0 ) + { + error( "%s", "Unterminated string constant" ); + } + + + *p++ = '"'; + *p = 0; + + /* build string: */ + { + wchar_t *s, *string = NULL; + wchar_t *p = &token_buffer[0]; + + while( *p ) + { + if( *p == '\n' || *p == '\t' ) *p = ' '; + ++p; + } + + string = (wchar_t *)malloc( maxtoken * 4 + 10 ); + + p = &token_buffer[1]; + s = &string[0]; + + while( *p == ' ' ) ++p; + + while( *p ) + { + if( *p != ' ' ) + { + switch( *p ) + { + case '&': + /************************************************ + Skip HTML symbol names such as  ,... etc.: + */ + if( ! html_symbol_name( p ) ) + { + *s++ = '&'; *s++ = 'a'; *s++ = 'm'; *s++ = 'p'; *s++ = ';'; ++p; + } + else + { + *s++ = *p++; + } + break; + + case '<': + *s++ = '&'; *s++ = 'l'; *s++ = 't'; *s++ = ';'; ++p; + break; + + case '>': + *s++ = '&'; *s++ = 'g'; *s++ = 't'; *s++ = ';'; ++p; + break; + + default: + *s++ = *p++; + break; + } + } + else + { + /* skip multiple spaces */ + if( *(p+1) != ' ' ) + *s++ = *p++; + else + ++p; + } + } + --s; *s = 0; + while( *(s-1) == ' ' ) --s; + *s = 0; + + (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)string ); + + free( string ); + } + + /********************* + install into symtab + *********************/ + { + yylval.sym = install( NULL, STRING, (char *)token_utf8_buffer ); + } + + token_buffer[0] = 0; + token_utf8_buffer[0] = 0; + value = STRING; + goto done; + } + + case 0: + value = 1; + goto done; + break; + + case '{': + indent_level++; + value = c; + goto done; + break; + + case '}': + indent_level--; + value = c; + goto done; + break; + + default: + value = c; + goto done; + break; + + } /* End switch( c ) */ + + } /* End while( 1 ) */ + +done: + + return( value ); +} |