Main Page | Class Hierarchy | Alphabetical List | Class List | Directories | File List | Class Members | File Members

slexer.cpp

Go to the documentation of this file.
00001 // +-------------------------------------------------------------------------+
00002 // |               I__n__t__e__L__i__b           0.6.10 development          |
00003 // | Copyright (c) Andrey Vikt. Stolyarov <crocodil_AT_croco.net> 2000-2007. |
00004 // |                                                                         |
00005 // | This is free software. The library part is available under              |
00006 // |                               GNU LESSER GENERAL PUBLIC LICENSE v.2.1.  |
00007 // | GNU LGPL v2.1 is found in docs/gnu_gpl2.txt,  or at  http://www.gnu.org |
00008 // |     Please see also docs/readme.txt and visit http://www.intelib.org    |
00009 // |                                                                         |
00010 // | !!! THERE IS NO WARRANTY OF ANY KIND, NEITHER EXPRESSED NOR IMPLIED !!! |
00011 // +-------------------------------------------------------------------------+
00012 
00013 
00014 
00015 
00016 #include <string.h>
00017 #include <ctype.h>
00018 #include "slexer.hpp"
00019 
00020 #ifndef EOF
00021 #define EOF (-1)
00022 #endif
00023 
00024 extern SReference Charp2LispNumber(const char* p);
00025 
00026 SLabel LexemaEof("#<END OF FILE>");
00027 
00028 IntelibSLexAnalyser::IntelibSLexAnalyser()
00029     : specchars(0), special_state(0), state(home),
00030       postponed_char(-1), line(1), error_message(0)
00031 {}
00032 
00033 IntelibSLexAnalyser::~IntelibSLexAnalyser()
00034 {
00035     while(specchars) {
00036         SpecChar *tmp = specchars->next;
00037         delete specchars;
00038         specchars = tmp;
00039     }
00040 }
00041 
00042 bool IntelibSLexAnalyser::AddDelimiter(const char *prefix,
00043                                        const SReference &token)
00044 {
00045     SpecChar *spc = AddSpecial(prefix, false);
00046     if(!spc) return false;
00047     delimiter_chars += prefix[0];
00048     spc->token = token;
00049     return true;
00050 }
00051 
00052 bool IntelibSLexAnalyser::AddNonDelimiter(const char *prefix,
00053                                           const SReference &token)
00054 {
00055     SpecChar *spc = AddSpecial(prefix, false);
00056     if(!spc) return false;
00057     spc->token = token;
00058     spc->status = SpecChar::non_delim;
00059     return true;
00060 }
00061 
00062 bool IntelibSLexAnalyser::AddTokenStarter(const char *prefix,
00063                                           SReference (*fun)(const char *))
00064 {
00065     SpecChar *spc = AddSpecial(prefix, true);
00066     if(!spc) return false;
00067     // in this case, we don't put the first char into delimiter_chars
00068     spc->makestring = fun;
00069     return true;
00070 }
00071 
00072 bool IntelibSLexAnalyser::AddStringStarter(const char *prefix,
00073                                            int closer_char,
00074                                            SReference (*fun)(const char *))
00075 {
00076     SpecChar *spc = AddSpecial(prefix, true);
00077     if(!spc) return false;
00078     delimiter_chars += prefix[0];
00079     spc->status = SpecChar::read_string;
00080     spc->closer[0] = closer_char;
00081     spc->closer[1] = 0;
00082     spc->makestring = fun;
00083     return true;
00084 }
00085 
00086 bool IntelibSLexAnalyser::AddCommentStarter(const char *prefix,
00087                                             const char *closer)
00088 {
00089     SpecChar *spc = AddSpecial(prefix, true);
00090     if(!spc) return false;
00091     delimiter_chars += prefix[0];
00092     spc->status = SpecChar::ignore_until;
00093     for(unsigned int i=0; i<sizeof(spc->closer)-1; i++) {
00094         if((spc->closer[i] = closer[i]) == 0) return true;
00095     }
00096 #if INTELIB_RUNTIME_CHECKS == 1
00097     error_message = "comment closer is too long";
00098     return false; 
00099 #else
00100     spc->closer[sizeof(spc->closer)-1] = 0;
00101 #endif
00102 }
00103 
00104 IntelibSLexAnalyser::SpecChar*
00105 IntelibSLexAnalyser::DoAddSpecial(SpecChar **p, const char *str, bool ex)
00106 {
00107     // str can't be empty here! that is, str[0] is definitely a char
00108     // let's see if the char is at this level
00109     if(!*p) {
00110         // we must place it right here!
00111         *p = new SpecChar;
00112         (*p)->ch = str[0];
00113         (*p)->sub = 0;
00114         (*p)->next = 0;
00115         if(str[1]) { // there must be a subtree
00116             (*p)->status = SpecChar::non_term;
00117             return DoAddSpecial(&((*p)->sub), str+1, ex);
00118         } else { // no subtree, terminate here
00119             (*p)->status = ex ? SpecChar::read_rest : SpecChar::term;
00120             return *p;
00121         }
00122     } else {
00123         // okay, the level exists (or we're actually traversing it
00124         // let's see what char is there
00125         if((*p)->ch == str[0]) {
00126             // here is it!
00127             if(str[1]) { // there must be a subtree
00128                 if((*p)->status == SpecChar::read_rest) {
00129                     error_message =
00130                         "Previously-added extendable special lexem is "
00131                         "a strict prefix of the new lexem";
00132                     return 0;
00133                 }
00134                 return DoAddSpecial(&((*p)->sub), str+1, ex);
00135             } else { // no subtree, terminate here
00136                 if((*p)->status != SpecChar::non_term) {
00137                     // lexem already exists!
00138                     error_message = "duplicate or conflicting special lexems";
00139                     return 0;
00140                 }
00141                 (*p)->status = ex ? SpecChar::read_rest : SpecChar::term;
00142                 return *p;
00143             }
00144         } else {
00145             // just try the next position
00146             return DoAddSpecial(&((*p)->next), str, ex);
00147         }
00148     }
00149 }
00150 
00151 IntelibSLexAnalyser::SpecChar*
00152 IntelibSLexAnalyser::AddSpecial(const char *str, bool extendable)
00153 {
00154     if(!*str) {
00155         error_message = "can't add empty special lexem";
00156         return false;
00157     }
00158     return DoAddSpecial(&specchars, str, extendable);
00159 }
00160 
00161 SReference IntelibSLexAnalyser::Get() const
00162 {
00163     return SReference(lex, lexline);
00164 }
00165 
00166 void IntelibSLexAnalyser::Drop() { 
00167     state = home;
00168     buf = "";
00169     lex = SReference();
00170     error_message = 0;
00171 }
00172 
00173 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::FeedChar(int c)
00174 {
00175     if(postponed_char != -1) {
00176         int pp = postponed_char;
00177         postponed_char = -1;
00178         FeedResult res = FeedChar(pp);
00179         switch(res) {
00180             case res_eof:
00181             case res_error:
00182             case res_ready:
00183                 postponed_char = c;
00184                 return res;
00185             case res_empty: 
00186             case res_continue:
00187                 return FeedChar(c);
00188             default:
00189                 return res_error;
00190         }
00191         throw IntelibX_bug();
00192     }
00193     if(c == '\n') line++;
00194     switch(state) {
00195         case home:
00196             return Home(c);
00197         case special:
00198         case special2:
00199             return Special(c);
00200         case special3:
00201             return Special3(c);
00202         case string:
00203             return String(c);
00204         case stringq:
00205             return Stringq(c);
00206         case token:
00207             return Token(c);
00208         case token_force:
00209             buf+=c;
00210             state = token;
00211             return res_continue;
00212         case comment:
00213 #if 0
00214             if(c == EOF) {
00215                 state = home;
00216                 return res_eof;
00217             }
00218             if(c == '\n')
00219                 state = home;
00220             return res_empty;
00221 #endif
00222             return Comment(c);
00223         default:
00224             throw IntelibX_bug();
00225     }
00226 }
00227 
00228 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::Home(int c)
00229 {
00230     lexline = line;
00231     switch(c) {
00232         case EOF: 
00233             return res_eof;
00234         case ' ':
00235         case '\n':
00236         case '\t':
00237         case '\r':
00238         case '\f':
00239         case '\v':
00240             return res_continue;
00241         default:
00242             buf = "";
00243             // check whether it's special
00244             special_state = &specchars;
00245             FeedResult res = Special(c);
00246             if(res != res_error) {
00247                 return res;
00248             }
00249             // being here means it isn't special
00250             buf += c;
00251             state = token;
00252             string_finalizer = 0;
00253             return res_continue;
00254     }
00255 }
00256 
00257 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::Special(int c)
00258 {
00259     if(c == EOF) {
00260         error_message = "eof within or right after a special lexem";
00261         return res_error;
00262     }
00263     for(SpecChar *p = *special_state; p; p = p->next) {
00264         if(p->ch == c) {
00265             buf += c;
00266             if(p->status == SpecChar::term) {
00267                 if(p->sub) { // could be longer
00268                     state = special2;
00269                     lex = p->token;
00270                     special_state = &(p->sub);
00271                     return res_continue;
00272                 } else {
00273                     lex = p->token;
00274                     buf = "";
00275                     state = home;
00276                     return res_ready; 
00277                 }
00278             }
00279             if(p->status == SpecChar::non_delim) {
00280                 state = special3;
00281                 lex = p->token;
00282                 special_state = &(p->sub);
00283                 return res_continue;
00284             }
00285             if(p->status == SpecChar::read_rest) {
00286                 state = token_force;
00287                 buf = "";
00288                 string_finalizer = p->makestring;
00289                 return res_continue; 
00290             }
00291             if(p->status == SpecChar::read_string) {
00292                 state = string;
00293                 buf = "";
00294                 closer = p->closer;
00295                 string_finalizer = p->makestring;
00296                 return res_continue; 
00297             }
00298             if(p->status == SpecChar::ignore_until) {
00299                 state = comment;
00300                 buf = "";
00301                 closer = p->closer;
00302                 comment_closer_index = 0;
00303                 return res_continue; 
00304             }
00305             if(!p->sub) // it's impossible for non_term
00306                 throw IntelibX_bug();
00307             state = special;
00308             special_state = &(p->sub);
00309             return res_continue;
00310         }
00311     }
00312     if(state == special2) {
00313         // this means the previous char was actually the last of the lexem
00314         // this->lex is already assigned, by the way
00315         buf = "";
00316         state = home;
00317         postponed_char = c;
00318         return res_ready; 
00319     } else if(state == special3) {
00320         // this means that, well, it looks like a regular token
00321         state = token;
00322         return res_continue;
00323     } else {
00324         error_message = "unexpected special lexem";
00325         return res_error;
00326     }
00327 }
00328 
00329 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::Special3(int c)
00330 {
00331     if(c==EOF || IsDelimiterChar(c) || isspace(c))
00332     {
00333         state = home;
00334         buf = "";
00335         if(!isspace(c)) {
00336             postponed_char = c;
00337         }
00338         return res_ready;
00339     } else {
00340         return Special(c);
00341     }
00342 }
00343 
00344 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::String(int c)
00345 {
00346     switch(c) {
00347         case EOF:
00348             error_message = "eof in string";
00349             return res_error;
00350         case '\\':
00351             state = stringq;
00352             return res_continue;
00353         default:
00354             if(c == closer[0]) {
00355                 lex = string_finalizer ?
00356                     string_finalizer(buf.c_str()) : SReference(buf);
00357                 buf = "";
00358                 state = home;
00359                 return res_ready;
00360             } else {
00361                 buf += c;
00362                 return res_continue;
00363             }
00364     }
00365 }
00366 
00367 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::Stringq(int c)
00368 {
00369     switch(c) {
00370         case EOF:
00371             error_message = "quoted eof";
00372             return res_error;
00373         case 'a': buf += '\a'; break;
00374         case 'b': buf += '\b'; break;
00375         case 'f': buf += '\f'; break;
00376         case 'n': buf += '\n'; break;
00377         case 'r': buf += '\r'; break;
00378         case 't': buf += '\t'; break;
00379         case 'v': buf += '\v'; break;
00380              // No, codes like \012 and \0xA are not implemented. Sorry.
00381              // Do you need them? If so, you're encouraged to
00382              // do this job and contribute it to me... or just
00383              // let me know, heh. In fact, the thing will require
00384              // at least one more state in this machine.
00385         default:
00386             buf += c;
00387     }
00388     state = string;
00389     return res_continue;
00390 }
00391 
00392 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::Token(int c)
00393 {
00394     if(c==EOF || IsDelimiterChar(c) || isspace(c))
00395     {
00396         state = home;
00397         if(!isspace(c)) {
00398             postponed_char = c;
00399         }
00400         lex = string_finalizer ?
00401             string_finalizer(buf.c_str()) : ProcessToken(buf.c_str());
00402         buf = "";
00403         if(!lex.GetPtr()) {
00404             return res_error;
00405         }
00406         return res_ready;
00407     } else {
00408         buf += c;
00409         return res_continue;
00410     }
00411 }
00412 
00413 IntelibSLexAnalyser::FeedResult IntelibSLexAnalyser::Comment(int c)
00414 {
00415     if(closer[comment_closer_index] == c) {
00416         comment_closer_index++;
00417         if(closer[comment_closer_index] == 0) {
00418             // end of comment
00419             state = home;
00420             return res_empty;
00421         }
00422     } else {
00423         comment_closer_index = 0;
00424     }
00425     return res_continue;
00426 }
00427 
// Tells whether the string `s` has the form of a decimal number:
// an optional leading sign, then digits with at most one decimal
// point, with at least one digit in total.
//   sign_ok - whether a leading '+' or '-' is acceptable
//   dot_ok  - whether a decimal point is acceptable
// A null pointer, an empty string and strings without a single digit
// (such as ".", "+" or "-.") are not numbers.
static bool IsNumber(const char *s, bool sign_ok = true, bool dot_ok = true)
{
    if(!s)
        return false;
    if(sign_ok && (*s == '-' || *s == '+'))
        s++;
    bool seen_digit = false;
    for(; *s; s++) {
        // isdigit() requires a value representable as unsigned char;
        // a plain char may be negative
        if(isdigit(static_cast<unsigned char>(*s))) {
            seen_digit = true;
        } else if(dot_ok && *s == '.') {
            dot_ok = false; // one decimal point only
        } else {
            return false;
        }
    }
    return seen_digit;
}
00441 
00442 SReference IntelibSLexAnalyser::ProcessToken(const char *s)
00443 {
00444 #if 0
00445     static const struct {
00446         const char *name;
00447         char val;
00448     } charnames[] = {
00449         { "NEWLINE", '\n' },
00450         { "SPACE", ' ' },
00451         { "TAB", '\t' },
00452         { "BACKSPACE", '\010' },
00453         { "LINEFEED", '\012' },
00454         { "PAGE", '\014' },
00455         { "RETURN", '\015' },
00456         { "RUBOUT", '\177' },
00457         { 0, 0 }
00458     };
00459     if(s[0] == '.' && s[1] == 0)
00460         return LexemaDot;
00461     if(s[0] == '#') {
00462         if(s[1] == '\\') {
00463             if(s[3] == '\0') {
00464                 // single char, such as #\a
00465                 return SReference(s[2]);
00466             } else {
00467                 int i;
00468                 for(i = 0; charnames[i].name; i++) {
00469                     if(strcasecmp(s+2, charnames[i].name) == 0)
00470                         return SReference(charnames[i].val);
00471                 }
00472                 error_message = "invalid character name";
00473                 return SReference(); // error
00474             }
00475         } else {
00476             // just return it as it is, it might be another special
00477             return SReference(new SExpressionClassicAtom(s));
00478         }
00479     } else
00480 #endif
00481     if(IsNumber(s)) {
00482         return Charp2LispNumber(s);
00483     } else {
00484         return SReference(new SExpressionClassicAtom(s));
00485     }
00486 }
00487 
00488 bool IntelibSLexAnalyser::IsDelimiterChar(int ch)
00489 {
00490     for(const char *tmp = delimiter_chars.c_str(); *tmp; tmp++)
00491         if(ch == (int)*tmp) return true;
00492     return false;
00493 }

Generated on Tue Dec 18 00:39:45 2007 for InteLib by  doxygen 1.4.1