summary refs log tree commit diff
path: root/cscmd/lex.c
diff options
context:
space:
mode:
Diffstat (limited to 'cscmd/lex.c')
-rw-r--r-- cscmd/lex.c 815
1 files changed, 815 insertions, 0 deletions
diff --git a/cscmd/lex.c b/cscmd/lex.c
new file mode 100644
index 0000000..318e074
--- /dev/null
+++ b/cscmd/lex.c
@@ -0,0 +1,815 @@
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdarg.h>
+#include <limits.h>
+#include <locale.h>
+#include <wchar.h>
+#include <wctype.h>
+
+#define PCRE2_CODE_UNIT_WIDTH 32
+#include <pcre2.h>
+
+#include <defs.h>
+
+#include <main.h>
+#include <error.h>
+#include <msglog.h>
+#include <xalloc.h>
+#include <utf8ing.h>
+#include <symtab.h>
+#include <parse.h>
+
+#include <lex.h>
+
+
+
+/* Current input line and column; maintained by the lexer as characters are read. */
+int lineno = 0;
+int colno = 0;
+
+/* Wide-character token buffer and the element count it is grown against. */
+static int maxtoken;
+static wchar_t *token_buffer;
+
+/* Buffer that receives the UTF-8 form of a token, with its own size counter. */
+static int max8token;
+static utf8_t *token_utf8_buffer;
+
+int indent_level = 0; /* Number of '{' minus number of '}'. */
+
+static int end_of_file = 0; /* set to 1 by yylex() when WEOF is read */
+static int nextchar = -1; /* one-character lookahead saved by yylex(); negative means "none" */
+
+static char *locale; /* locale string kept by init_lex() for use in fini_lex() */
+
+/*
+ * GETC: read one wide character from the stream `config` (declared outside
+ * this file) and advance colno. The macro argument is not used.
+ * UNGETC: push `c` back onto `config` and step colno back.
+ * Both are GNU statement expressions yielding the fgetwc()/ungetwc() result.
+ */
+#define GETC(c) ({ wint_t ret; ++colno; ret = fgetwc( config ); ret; })
+#define UNGETC(c) ({ wint_t ret; --colno; ret = ungetwc( c, config ); ret; })
+
+
+/*
+ * Enlarge token_buffer: maxtoken becomes 2 * maxtoken + 10 and the buffer is
+ * reallocated with room for maxtoken + 2 wide characters.
+ *
+ * p - a position inside the current token_buffer.
+ *
+ * Returns the same position expressed relative to the reallocated buffer,
+ * so the caller can keep writing where it left off.
+ */
+static wchar_t *extend_token_buffer( wchar_t *p )
+{
+  /* remember how far in we are before the buffer can move */
+  int used = p - token_buffer;
+
+  maxtoken = 2 * maxtoken + 10;
+  token_buffer = (wchar_t *)xrealloc( token_buffer, sizeof(wchar_t) * (maxtoken + 2) );
+
+  return( &token_buffer[used] );
+}
+
+/*
+ * Enlarge token_utf8_buffer: max8token becomes 2 * max8token + 10 and the
+ * buffer is reallocated to (max8token + 2) * 6 bytes.
+ *
+ * p - a position inside the current token_utf8_buffer.
+ *
+ * Returns the same position expressed relative to the reallocated buffer.
+ */
+static utf8_t *extend_token_utf8_buffer( utf8_t *p )
+{
+  /* remember how far in we are before the buffer can move */
+  int used = p - token_utf8_buffer;
+
+  max8token = 2 * max8token + 10;
+  token_utf8_buffer = (utf8_t *)xrealloc( token_utf8_buffer, 6 * (max8token + 2) );
+
+  return( &token_utf8_buffer[used] );
+}
+
+
+/*
+ * Error callback with the name and signature a yacc/bison parser expects:
+ * passes the message `s` to error() as the argument of a "%s" format, so
+ * the message text itself is never interpreted as a format string.
+ */
+void yyerror( char const *s )
+{
+ error( "%s", s );
+}
+
+
+/*
+ * Prepare the lexer for a new input.
+ *
+ * Remembers the locale in effect, switches LC_ALL to "en_US.utf8", resets
+ * the position counters and lexer state, and allocates the two token
+ * buffers with room for 40 characters plus the same two-element slack that
+ * the extend_token_*_buffer() helpers reserve.
+ */
+void init_lex( void )
+{
+  const char *current;
+
+  /*
+   * setlocale() with a locale name returns the NEW locale, so the previous
+   * one has to be queried first. The returned string may be overwritten by
+   * later setlocale() calls, hence the private copy.
+   */
+  free( locale );
+  locale = NULL;
+
+  current = setlocale( LC_ALL, NULL );
+  if( current )
+  {
+    locale = (char *)xmalloc( strlen( current ) + 1 );
+    if( locale ) strcpy( locale, current );
+  }
+
+  (void)setlocale( LC_ALL, "en_US.utf8" );
+
+  lineno = 0;
+  colno = 0;
+
+  nextchar = -1;
+  maxtoken = 40;
+  max8token = 40;
+
+  indent_level = 0;
+  end_of_file = 0;
+
+  /* yylex() may write a terminator at index maxtoken: size in elements, not bytes */
+  token_buffer = (wchar_t *)xmalloc( (maxtoken + 2) * sizeof(wchar_t) );
+  token_utf8_buffer = (utf8_t *)xmalloc( (max8token + 2) * 6 );
+}
+
+/*
+ * Undo init_lex(): restore the locale that was active before init_lex(),
+ * release the token buffers and reset all lexer state to its initial values.
+ */
+void fini_lex( void )
+{
+  if( locale )
+  {
+    (void)setlocale( LC_ALL, locale );
+    free( locale );
+    locale = NULL;
+  }
+
+  free( token_buffer );
+  token_buffer = NULL;
+  free( token_utf8_buffer );
+  token_utf8_buffer = NULL;
+
+  indent_level = 0;
+  end_of_file = 0;
+
+  max8token = 0;
+  maxtoken = 0;
+  nextchar = -1;
+
+  lineno = 0;
+  colno = 0;
+}
+
+/*
+ * Start a new input line: bump lineno, reset colno, and read past leading
+ * blanks and tabs. If the first other character is '#', the rest of the
+ * line is discarded as well.
+ *
+ * Returns the first non-blank character of the line, or - for a '#' line -
+ * the '\n' (or WEOF) that ended it.
+ */
+static wint_t check_newline( void )
+{
+  wint_t ch;
+
+  ++lineno;
+  colno = 0; /* GETC()/UNGETC() do the counting; here it only has to be reset */
+
+  /* find the first character that is neither a blank nor a tab */
+  do
+  {
+    ch = GETC();
+  } while( ch == ' ' || ch == '\t' );
+
+  if( ch != '#' ) return( ch );
+
+  /* '#' line: skip the rest of this line */
+  do
+  {
+    ch = GETC();
+  } while( ch != '\n' && ch != WEOF );
+
+  return( ch );
+}
+
+/*
+ * Skip a comment whose opening has already been read.
+ *
+ * c - the character that selects the comment kind:
+ *       '*'        block comment, runs to the closing star-slash pair;
+ *       '/' or '#' line comment, runs to the end of the line;
+ *       any other  not a comment, c is returned unchanged.
+ *
+ * Returns the first character after the comment (after check_newline()
+ * processing when the comment is followed by or ended by a newline), or
+ * WEOF after calling unterminated_comment() when input ends first.
+ */
+static wint_t skip_comment( int c )
+{
+  if( c == '/' || c == '#' )
+  {
+    /* line comment: consume up to the newline */
+    for( ;; )
+    {
+      c = GETC();
+      if( c == WEOF )
+      {
+        unterminated_comment();
+        return( WEOF );
+      }
+      if( c == '\n' )
+      {
+        c = check_newline();
+        return( c );
+      }
+    }
+  }
+
+  if( c != '*' ) return( c );
+
+  /* block comment */
+  for( ;; )
+  {
+    /* run forward to the next '*', keeping the line count up to date */
+    do
+    {
+      c = GETC();
+      if( c == '\n' ) { ++lineno; colno = 0; }
+
+    } while( c != '*' && c != WEOF );
+
+    if( c == WEOF )
+    {
+      unterminated_comment();
+      return( WEOF );
+    }
+
+    c = GETC();
+
+    if( c == '/' )
+    {
+      /* comment closed: hand back the character that follows it */
+      c = GETC();
+      if( c == '\n' ) c = check_newline();
+      return( c );
+    }
+
+    /* a lone '*': put the follower back so it is examined again */
+    UNGETC( c );
+  }
+
+} /* End skip_comment() */
+
+/*
+ * Skip white space, comments and backslash-newline pairs, starting with
+ * the already-read character c.
+ *
+ * Returns the first character that is none of those. A '/' that does not
+ * open a comment is returned as '/', with the character after it pushed
+ * back. A backslash not followed by a newline produces the warning
+ * "Stray '\' in program"; in both backslash cases scanning resumes with the
+ * character after the one that followed the backslash.
+ */
+static wint_t skip_white_space( wint_t c )
+{
+  for( ;; )
+  {
+    switch( c )
+    {
+      case '\n':
+        c = check_newline();
+        break;
+
+      case '#':
+        /* line comment; keep scanning from whatever follows it */
+        c = skip_comment( c );
+        break;
+
+      case '/':
+        c = GETC();
+        if( c != '/' && c != '*' )
+        {
+          /* plain slash, not a comment opener */
+          UNGETC( c );
+          return( '/' );
+        }
+        c = skip_comment( c );
+        break;
+
+      case ' ':
+      case '\t':
+      case '\f':
+      case '\v':
+      case '\b':
+      case '\r':
+        c = GETC();
+        break;
+
+      case '\\':
+        c = GETC();
+        if( c != '\n' )
+        {
+          warning( "%s", "Stray '\\' in program" );
+        }
+        else
+        {
+          ++lineno;
+          colno = 0;
+        }
+        c = GETC();
+        break;
+
+      default:
+        return( c );
+
+    } /* End switch( c ) */
+
+  } /* End for( ;; ) */
+
+} /* End skip_white_space() */
+
+static wint_t readescape( int *ignore_ptr )
+/*
+ Read the escape sequence that follows an already-consumed backslash and
+ return the character it stands for:
+
+   \x followed by hex digits - the accumulated hexadecimal value;
+   \ followed by octal digits - the value of up to six octal digits;
+   \\ \' \"                   - the character itself;
+   \n \t \r \f \b \a \v       - the matching control character;
+   backslash-newline          - returns 0 and stores 1 in *ignore_ptr;
+   anything else              - the character after the backslash, unchanged.
+
+ *ignore_ptr is written only in the backslash-newline case, so the caller
+ has to clear it before the call.
+ */
+{
+ wint_t c = GETC();
+ wint_t code;
+ unsigned count;
+ unsigned firstdig = 0;
+ int nonull;
+
+ switch( c )
+ {
+ case 'x':
+ /* count: significant hex digits (leading zeros excluded);
+    firstdig: value of the first significant digit;
+    nonull: set once any hex digit at all has been read. */
+ code = 0;
+ count = 0;
+ nonull = 0;
+ while( 1 )
+ {
+ c = GETC();
+ if( !(c >= 'a' && c <= 'f') &&
+ !(c >= 'A' && c <= 'F') &&
+ !(c >= '0' && c <= '9') )
+ {
+ /* first non-hex character ends the escape; leave it for the caller */
+ UNGETC( c );
+ break;
+ }
+ code *= 16;
+ if( c >= 'a' && c <= 'f' ) code += c - 'a' + 10;
+ if( c >= 'A' && c <= 'F' ) code += c - 'A' + 10;
+ if( c >= '0' && c <= '9' ) code += c - '0';
+ if( code != 0 || count != 0 )
+ {
+ if( count == 0 ) firstdig = code;
+ count++;
+ }
+ nonull = 1;
+
+ } /* End while( 1 ) */
+
+ if( !nonull )
+ {
+ error( "%s", "\\x used with no following hex digits" );
+ }
+ else if( count == 0 )
+ /* Digits are all 0's. Ok. */
+ ;
+ /* warn when the significant digits cannot fit in 32 bits */
+ else if( (count - 1) * 4 >= 32 || /* 32 == bits per INT */
+ (count > 1 && ((1 << (32 - (count-1) * 4)) <= firstdig )))
+ {
+ warning( "%s", "Hex escape out of range" );
+ }
+ return( code );
+
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7':
+ code = 0;
+ count = 0;
+ /* c already holds the first octal digit; take at most six digits */
+ while( (c <= '7') && (c >= '0') && (count++ < 6) )
+ {
+ code = (code * 8) + (c - '0');
+ c = GETC();
+ }
+ /* the character that stopped the loop is not part of the escape */
+ UNGETC( c );
+ return( code );
+
+ case '\\': case '\'': case '"':
+ return( c );
+
+ case '\n':
+ /* backslash-newline: count the line and tell the caller to ignore the result */
+ lineno++; colno = 0;
+ *ignore_ptr = 1;
+ return( 0 );
+
+ case 'n':
+ return( '\n' );
+
+ case 't':
+ return( '\t' );
+
+ case 'r':
+ return( '\r' );
+
+ case 'f':
+ return( '\f' );
+
+ case 'b':
+ return( '\b' );
+
+ case 'a':
+ return( '\a' );
+
+ case 'v':
+ return( '\v' );
+ }
+
+ /* unrecognized escape: the character stands for itself */
+ return( c );
+
+} /* End of readescape() */
+
+
+/*
+ * Test whether the wide string `str` begins with an HTML character
+ * reference, i.e. matches the pattern ^(&[#A-Za-z0-9]*;).
+ *
+ * Returns 1 on a match; 0 when there is no match, when pcre2_match()
+ * reports any error, or when the pattern cannot be compiled.
+ * The pattern is compiled and released on every call.
+ */
+int html_symbol_name( wchar_t *str )
+{
+  wchar_t pattern[] = L"^(&[#A-Za-z0-9]*;)";
+  int errcode = 0;
+  PCRE2_SIZE erroffset = 0;
+  pcre2_code *re;
+  pcre2_match_data *md;
+  int matched;
+
+  re = pcre2_compile( (PCRE2_SPTR)pattern, PCRE2_ZERO_TERMINATED, 0, &errcode, &erroffset, NULL );
+  if( re == NULL )
+  {
+    return 0; /* PCRE compilation failed */
+  }
+
+  md = pcre2_match_data_create_from_pattern( re, NULL );
+
+  /* a negative result is "no match" or an error; both count as 0 */
+  matched = ( pcre2_match( re, (PCRE2_SPTR)str, (int)wcslen(str), 0, 0, md, NULL ) >= 0 );
+
+  pcre2_match_data_free( md );
+  pcre2_code_free( re );
+
+  return matched;
+}
+
+
+/*
+ * Lexical analyzer entry point called by the parser.
+ *
+ * Reads wide characters through GETC(), skips white space and comments,
+ * and returns one token code per call:
+ *
+ *   0             end of input (WEOF was read); end_of_file is set
+ *   VARIABLE      an identifier; yylval.sym is the symbol found by lookup()
+ *                 or newly created by install(). When that symbol's type is
+ *                 NUMERICAL, STRING or PATH, that type is returned instead
+ *   SECTION, REPO the keywords "section" and "repo"
+ *   NUMERICAL     a run of decimal digits
+ *   PATH          text between single quotes with every blank removed
+ *   STRING        text between double quotes; '<', '>' and any '&' that
+ *                 does not start an HTML character reference are replaced
+ *                 by entities, runs of blanks collapse to one blank, and
+ *                 leading/trailing blanks are dropped
+ *   1             a NUL character in the input
+ *   otherwise     the character itself ('{' and '}' also adjust indent_level)
+ *
+ * The character that terminates an identifier or number is kept in
+ * nextchar and consumed by the following call.
+ */
+int yylex( void )
+{
+  wint_t c;
+  wchar_t *p;
+  int value;
+
+  /* consume the lookahead character saved by the previous call, if any */
+  if( nextchar >= 0 )
+    c = nextchar, nextchar = -1;
+  else
+    c = GETC();
+
+  while( 1 )
+  {
+    switch( c )
+    {
+      case ' ':
+      case '\t':
+      case '\f':
+      case '\v':
+      case '\b':
+        c = skip_white_space( c );
+        break;
+
+      case '\r':
+      case '\n':
+      case '/':
+      case '#':
+      case '\\':
+        c = skip_white_space( c );
+        /* fallthrough */
+
+      default:
+        goto found_nonwhite;
+
+    } /* End switch( c ) */
+found_nonwhite:
+
+    token_buffer[0] = c;
+    token_buffer[1] = 0;
+
+    switch( c )
+    {
+      case WEOF:
+        end_of_file = 1;
+        token_buffer[0] = 0;
+        value = 0;
+        goto done;
+        break;
+
+      case '$': /* dollar in identifier */
+        if( 1 ) goto letter;
+        return '$';
+
+      case 'A': case 'B': case 'C': case 'D': case 'E':
+      case 'F': case 'G': case 'H': case 'I': case 'J':
+      case 'K': case 'L': case 'M': case 'N': case 'O':
+      case 'P': case 'Q': case 'R': case 'S': case 'T':
+      case 'U': case 'V': case 'W': case 'X': case 'Y':
+      case 'Z':
+      case 'a': case 'b': case 'c': case 'd': case 'e':
+      case 'f': case 'g': case 'h': case 'i': case 'j':
+      case 'k': case 'l': case 'm': case 'n': case 'o':
+      case 'p': case 'q': case 'r': case 's': case 't':
+      case 'u': case 'v': case 'w': case 'x': case 'y':
+      case 'z':
+      case '_':
+
+      /* RUSSIAN */
+      case L'А': case L'Б': case L'В': case L'Г': case L'Д':
+      case L'Е': case L'Ё': case L'Ж': case L'З': case L'И':
+      case L'Й': case L'К': case L'Л': case L'М': case L'Н':
+      case L'О': case L'П': case L'Р': case L'С': case L'Т':
+      case L'У': case L'Ф': case L'Х': case L'Ц': case L'Ч':
+      case L'Ш': case L'Щ': case L'Ъ': case L'Ы': case L'Ь':
+      case L'Э': case L'Ю': case L'Я':
+
+      case L'а': case L'б': case L'в': case L'г': case L'д':
+      case L'е': case L'ё': case L'ж': case L'з': case L'и':
+      case L'й': case L'к': case L'л': case L'м': case L'н':
+      case L'о': case L'п': case L'р': case L'с': case L'т':
+      case L'у': case L'ф': case L'х': case L'ц': case L'ч':
+      case L'ш': case L'щ': case L'ъ': case L'ы': case L'ь':
+      case L'э': case L'ю': case L'я':
+
+letter:
+        /* identifier: letters/digits plus '_', '$', '@', '-', '.', ':' */
+        p = token_buffer;
+        while( iswalnum( c ) || c == '_' || c == '$' || c == '@' || c == '-' || c == '.' || c == ':' )
+        {
+          if( p >= token_buffer + maxtoken )
+          {
+            p = extend_token_buffer( p );
+            extend_token_utf8_buffer( token_utf8_buffer );
+          }
+
+          *p++ = c;
+          c = GETC();
+        }
+        *p = 0;
+        nextchar = c; /* first character past the identifier, for the next call */
+        value = VARIABLE;
+
+        (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer );
+
+        /*********************
+          install into symtab
+         *********************/
+        {
+          if( !strcmp( "section", (const char *)token_utf8_buffer ) )
+          {
+            value = SECTION;
+            yylval.sym = install( NULL, SECTION, NULL );
+          }
+          else if( !strcmp( "repo", (const char *)token_utf8_buffer ) )
+          {
+            value = REPO;
+            yylval.sym = install( NULL, REPO, NULL );
+          }
+          else
+          {
+            SYMBOL *sp = NULL;
+
+            if( (sp = lookup( (const char *)token_utf8_buffer )) == (SYMBOL *)0 )
+              sp = install( (const char *)token_utf8_buffer, VARIABLE, 0 );
+
+            /******************************************************************
+              If the variable is already in the table, we assume that its
+              type is one of the permitted ones: NUMERICAL, STRING, or PATH.
+             ******************************************************************/
+            if( sp->type != VARIABLE )
+            {
+              switch( sp->type )
+              {
+                case NUMERICAL:
+                case STRING:
+                case PATH:
+                  value = sp->type;
+                  break;
+                default:
+                  /* error */
+                  break;
+              }
+            }
+            yylval.sym = sp;
+          }
+        }
+
+        token_buffer[0] = 0;
+        token_utf8_buffer[0] = 0;
+        goto done;
+        break;
+
+      case '0': case '1': case '2': case '3': case '4':
+      case '5': case '6': case '7': case '8': case '9':
+      {
+        int constant = 0;
+/* integer: */
+        p = token_buffer;
+        while( iswdigit( c ) )
+        {
+          if( p >= token_buffer + maxtoken )
+          {
+            p = extend_token_buffer( p );
+            extend_token_utf8_buffer( token_utf8_buffer );
+          }
+
+          *p++ = c;
+          c = GETC();
+        }
+        *p = 0;
+        nextchar = c; /* first character past the number, for the next call */
+        value = NUMERICAL;
+
+        (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)token_buffer );
+
+        /*********************
+          install into symtab
+         *********************/
+        {
+          (void)swscanf( (const wchar_t *)token_buffer, L"%d", &constant );
+          yylval.sym = install( NULL, NUMERICAL, constant );
+        }
+
+        token_buffer[0] = 0;
+        token_utf8_buffer[0] = 0;
+        goto done;
+        break;
+      }
+
+      case '\'':
+/* path_constant: */
+      {
+        int num_chars = 0;
+        unsigned int width = 8; /* to allow non-ASCII in path set width = 16 */
+
+        /* token_buffer[0] already holds the opening quote; text starts at index 1 */
+        while( 1 )
+        {
+tryagain:
+          c = GETC();
+
+          if( c == '\'' || c == WEOF ) break;
+          if( c == '\\' )
+          {
+            int ignore = 0;
+            c = readescape( &ignore );
+            if( ignore ) goto tryagain;
+            if( (unsigned)c >= (1 << width) )
+            {
+              warning( "%s", "Escape sequence out of range" );
+            }
+          }
+          else if( c == '\n' ) { lineno++; colno = 0; }
+
+          num_chars++;
+          if( num_chars > maxtoken - 4 )
+          {
+            extend_token_buffer( token_buffer );
+            extend_token_utf8_buffer( token_utf8_buffer );
+          }
+
+          token_buffer[num_chars] = c;
+
+        } /* End while( 1 ) */
+
+        token_buffer[num_chars + 1] = '\'';
+        token_buffer[num_chars + 2] = 0;
+
+        if( c != '\'' )
+        {
+          error( "%s", "Malformated path constant" );
+        }
+        else if( num_chars == 0 )
+        {
+          error( "%s", "Empty path constant" );
+        }
+
+        /* build path: */
+        {
+          wchar_t *src, *dst, *string = NULL;
+          size_t len;
+
+          for( src = &token_buffer[0]; *src; ++src )
+          {
+            if( *src == '\n' || *src == '\t' ) *src = ' ';
+          }
+          len = (size_t)(src - token_buffer);
+
+          /* the result is never longer than the token itself */
+          string = (wchar_t *)xmalloc( (len + 1) * sizeof(wchar_t) );
+
+          src = &token_buffer[1];
+          dst = &string[0];
+
+          while( *src == ' ' ) ++src;
+
+          /* copy everything except blanks */
+          while( *src )
+          {
+            if( *src != ' ' )
+              *dst++ = *src++;
+            else
+              ++src;
+          }
+
+          /* drop the closing quote, then trailing blanks, without ever
+             stepping in front of the buffer (empty constant, embedded NUL) */
+          if( dst > string ) --dst;
+          *dst = 0;
+          while( dst > string && *(dst-1) == ' ' ) --dst;
+          *dst = 0;
+
+          (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)string );
+
+          free( string );
+        }
+
+        /*********************
+          install into symtab
+         *********************/
+        {
+          yylval.sym = install( NULL, PATH, (char *)token_utf8_buffer );
+        }
+
+        token_buffer[0] = 0;
+        token_utf8_buffer[0] = 0;
+        value = PATH;
+        goto done;
+      }
+
+      case '"':
+/* string_constant: */
+      {
+        c = GETC();
+        p = token_buffer + 1; /* index 0 already holds the opening quote */
+
+        /* wint_t may be unsigned, so end of input must be tested against
+           WEOF; a sign test would never see it and the loop would not end */
+        while( c != '"' && c != WEOF )
+        {
+          if( c == '\\' )
+          {
+            int ignore = 0;
+            c = readescape( &ignore );
+            if( ignore ) goto skipnewline;
+          }
+          else if( c == '\n' ) { lineno++; colno = 0; }
+
+          if( p >= token_buffer + maxtoken )
+          {
+            p = extend_token_buffer( p );
+            extend_token_utf8_buffer( token_utf8_buffer );
+          }
+          *p++ = c;
+
+skipnewline:
+          c = GETC();
+
+        } /* End while( " ) */
+
+        *p = 0;
+
+        if( c == WEOF )
+        {
+          error( "%s", "Unterminated string constant" );
+        }
+
+
+        *p++ = '"';
+        *p = 0;
+
+        /* build string: */
+        {
+          wchar_t *src, *dst, *string = NULL;
+          size_t len;
+
+          for( src = &token_buffer[0]; *src; ++src )
+          {
+            if( *src == '\n' || *src == '\t' ) *src = ' ';
+          }
+          len = (size_t)(src - token_buffer);
+
+          /* worst case every character is '&' and becomes the five
+             characters "&amp;", so reserve five output slots per input */
+          string = (wchar_t *)xmalloc( (len * 5 + 1) * sizeof(wchar_t) );
+
+          src = &token_buffer[1];
+          dst = &string[0];
+
+          while( *src == ' ' ) ++src;
+
+          while( *src )
+          {
+            if( *src != ' ' )
+            {
+              switch( *src )
+              {
+                case '&':
+                  /************************************************
+                    Skip HTML symbol names such as &nbsp,... etc.:
+                   */
+                  if( ! html_symbol_name( src ) )
+                  {
+                    *dst++ = '&'; *dst++ = 'a'; *dst++ = 'm'; *dst++ = 'p'; *dst++ = ';'; ++src;
+                  }
+                  else
+                  {
+                    *dst++ = *src++;
+                  }
+                  break;
+
+                case '<':
+                  *dst++ = '&'; *dst++ = 'l'; *dst++ = 't'; *dst++ = ';'; ++src;
+                  break;
+
+                case '>':
+                  *dst++ = '&'; *dst++ = 'g'; *dst++ = 't'; *dst++ = ';'; ++src;
+                  break;
+
+                default:
+                  *dst++ = *src++;
+                  break;
+              }
+            }
+            else
+            {
+              /* skip multiple spaces */
+              if( *(src+1) != ' ' )
+                *dst++ = *src++;
+              else
+                ++src;
+            }
+          }
+
+          /* drop the closing quote, then trailing blanks, without ever
+             stepping in front of the buffer (empty constant, embedded NUL) */
+          if( dst > string ) --dst;
+          *dst = 0;
+          while( dst > string && *(dst-1) == ' ' ) --dst;
+          *dst = 0;
+
+          (void)copy_ucs4_to_utf8( (utf8_t *)token_utf8_buffer, (const ucs4_t *)string );
+
+          free( string );
+        }
+
+        /*********************
+          install into symtab
+         *********************/
+        {
+          yylval.sym = install( NULL, STRING, (char *)token_utf8_buffer );
+        }
+
+        token_buffer[0] = 0;
+        token_utf8_buffer[0] = 0;
+        value = STRING;
+        goto done;
+      }
+
+      case 0:
+        value = 1;
+        goto done;
+        break;
+
+      case '{':
+        indent_level++;
+        value = c;
+        goto done;
+        break;
+
+      case '}':
+        indent_level--;
+        value = c;
+        goto done;
+        break;
+
+      default:
+        value = c;
+        goto done;
+        break;
+
+    } /* End switch( c ) */
+
+  } /* End while( 1 ) */
+
+done:
+
+  return( value );
+}