【背景】
需要用antlr实现C语言的预处理:
include,define等等内容。
参考了:
[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)
已经实现了部分的事情。
代码如下:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 | grammar preprocess; //lexer grammar preprocess; options{ language=Java; output = AST; } @lexer::header { //package com.mm.antlrv3demo; import java.io.*; import java.util.*; } @parser::header { //package com.mm.antlrv3demo; } @lexer::members { //public static TokenStreamSelector selector; // must be assigned externally protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true protected static List ifStates = new ArrayList(); // holds nested if conditions protected static Map defines = new Hashtable(); // holds the defines protected Map defineArgs = new Hashtable(); // holds the args for a macro call /* public void uponEOF() throws TokenStreamException, CharStreamException { try { selector.pop(); // return to old lexer/stream selector.retry(); } catch (NoSuchElementException e) { // return a real EOF if nothing in stack } } */ class SaveStruct { SaveStruct(CharStream input){ this.input = input; this.marker = input.mark(); } public CharStream input; public int marker; } Stack<SaveStruct> includes = new Stack<SaveStruct>(); // We should override this method for handling EOF of included file public Token nextToken(){ Token token = super.nextToken(); if(token.getType() == Token.EOF && !includes.empty()){ // We've got EOF and have non empty stack. SaveStruct ss = includes.pop(); setCharStream(ss.input); input.rewind(ss.marker); //this should be used instead of super [like below] to handle exits from nested includes //it matters, when the 'include' token is the last in previous stream (using super, lexer 'crashes' returning EOF token) token = this.nextToken(); } // Skip first token after switching on another input. // You need to use this rather than super as there may be nested include files if(((CommonToken)token).getStartIndex() < 0) token = this.nextToken(); return token; } } COMMENT : ('//' ~('\n'|'\r')* '\r'? '\n') {skip();} | ('/*' ( options {greedy=false;} : . )* '*/') {skip();} ; // and lexer rule INCLUDE : '#include' (WS)? f=STRING { String name = f.getText(); name = name.substring(1,name.length()-1); try { // save current lexer's state SaveStruct ss = new SaveStruct(input); includes.push(ss); // switch on new input stream setCharStream(new ANTLRFileStream(name)); reset(); } catch(Exception fnf) { throw new Error("Cannot open file " + name); } }; /* fragment NON_CR_LF : ~('\r'|'\n'); fragment TAB_SPACE : (' ' | '\t'); */ //DIRECTIVE : ('#define' WS* defineMacro=ID WS* defineText=STRING) //DIRECTIVE : ('#define' WS* defineMacro=ID WS* defineText=( NON_CR_LF+ | (NON_CR_LF* (TAB_SPACE+ '\\' '\r'? '\n' NON_CR_LF+)*) ) ) fragment //MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n') //MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n') //MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n') MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*; //MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*; DIRECTIVE @init{ List args = new ArrayList(); boolean condition = true; } : ('#define' WS* defineMacro=RAW_IDENTIFIER ( ( '(' // get arguments if you find them (no spaces before left paren) (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());} ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )* ')' | ' '|'\t'|'\f' ) ( options{greedy=true;}: ' '|'\t'|'\f' )* // store the text verbatim - tokenize when called defineText=MACRO_TEXT {args.set(0,defineText.getText());} )? '\n' { defines.put( defineMacro.getText(), args ); skip(); } ); IDENTIFIER @init{ List define = new ArrayList(); List args = new ArrayList(); } : identifier=RAW_IDENTIFIER { // see if this is a macro argument define = (List)defineArgs.get(identifier.getText()); } ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)? // take in arguments if macro call requires them '(' callArg0=EXPR {args.add(callArg0.getText());} ( COMMA callArg1=EXPR {args.add(callArg1.getText());} )* { args.size()==define.size()-1 }? // better have right amount ')' | { !((define!=null) && (define.size()>1)) }? ) { if (define!=null) { String defineText = (String)define.get(0); // create a new lexer to handle the macro text preprocessLexer sublexer = new preprocessLexer(new DataInputStream(new StringBufferInputStream(defineText))); for (int i=0;i<args.size();++i) { // treat macro arguments similar to local defines List arg = new ArrayList(); arg.add((String)args.get(i)); sublexer.defineArgs.put( (String)define.get(1+i), arg ); } selector.push(sublexer); // retry in new lexer selector.retry(); } }; fragment RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ; NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ; // allow ahpha suffixes on numbers (i.e. L:long) // group symbols into categories to parse EXPR LEFT : '(' | '[' | '{' ; RIGHT : ')' | ']' | '}' ; COMMA : ',' ; OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ; fragment EXPR // allow just about anything without being ambiguous : (WS)? (NUMBER|IDENTIFIER)? ( ( LEFT EXPR ( COMMA EXPR )* RIGHT | STRING | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here ) EXPR )? ; //INT : '0'..'9'+ ; FLOAT : ('0'..'9')+ '.' ('0'..'9')* EXPONENT? | '.' ('0'..'9')+ EXPONENT? | ('0'..'9')+ EXPONENT ; WS : ( ' ' | '\t' | '\r' | '\n' ) {$channel=HIDDEN;} ; //RestSymbo : '{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ; STRING : '"' ( ESC_SEQ | ~('\\'|'"') )* '"' ; CHAR: '\'' ( ESC_SEQ | ~('\''|'\\') ) '\'' ; fragment EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ; fragment HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ; fragment ESC_SEQ : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\') | UNICODE_ESC | OCTAL_ESC ; fragment OCTAL_ESC : '\\' ('0'..'3') ('0'..'7') ('0'..'7') | '\\' ('0'..'7') ('0'..'7') | '\\' ('0'..'7') ; fragment UNICODE_ESC : '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT ; header : include*; include : INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>'; |
但是还是遇到很多问题,其中主要就是,针对于旧的antlr v2的TokenStreamSelector,如何换用成antlr v3的逻辑,用哪些函数和类替代。
【折腾过程】
1.关于预处理的问题,这人:
[antlr-interest] C PreProcessor Errors
也遇到类似的事情,但是对此处没啥帮助。
2.这里:
[antlr-interest] ANTLR 3 migration: TokenStreamSelector
和:
[antlr-interest] TokenStreamSelector + ANTLRv3
也提到了,v2转v3时,如何处理TokenStreamSelector,但是没人回答。
3.这里:
Tips on designing a preprocessor for C++ using Antlr
关于预处理,已经解释的很全了,但是还是antlr v2的版本,还是不能完全透彻的理解,还是无法找到TokenStreamSelector的替代品。
4.google搜:
antlr TokenStream Selector deprecated
看到了“Token Stream Multiplexing”,所以,去找找antlr作者写的书
The Definitive ANTLR Reference.pdf
看看其中关于此部分的解释,或许可以找到有价值的参考资料。
5.另外,顺便提示一句,上述代码中的那个:
testLiterals
实际上是antlr v2的语法
根据:
Migrating from ANTLR 2 to ANTLR 3
的某人评论,得知此testLiterals,antlr v3中也没了。
6.参考:
[antlr-interest] v3 – How to deal with include Files?
也讨论了类似问题,但是还是无解。
7.自己看代码,有一点点眉目了:
(1)antlr v2中的处理新的lexer(和tokenStream)的逻辑
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | public static TokenStreamSelector selector; // must be assigned externally protected static Map defines = new Hashtable(); // holds the defines public void uponEOF() throws TokenStreamException, CharStreamException { try { selector.pop(); // return to old lexer/stream selector.retry(); } ...... } : '#' ( "include" (WS)? includeFile:STRING { ...... try { cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name))); sublexer.defines = defines; // want defines to be persistent sublexer.setFilename(name); selector.push(sublexer); selector.retry(); } ...... } } ...... } else { // create a new lexer to handle the macro text cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText))); ...... selector.push(sublexer); // retry in new lexer selector.retry(); } }}; |
即,主要是:
用new cppLexer新建一个sublexer,
然后初始化一堆东西,比如:
给对应的给全局变量defines去赋值等等
然后就转到新的sublexer去处理了,调用方法是:
先push
再retry
而后,对于新的lexer,都有对应的uponEOF,
其中目的是遇到了EOF,要返回之前的(父级的lexer,所以
先去pop(返回到上一级,父级的lexer)
再去retry(相当于刷新,去使用当前的,父级的lexer)
(2)而与此相对应的,目前已经实现了,antlr v3的,处理新的lexer(和tokenStream)的代码是:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | Stack<SaveStruct> includes = new Stack<SaveStruct>(); // We should override this method for handling EOF of included file public Token nextToken(){ ...... if(token.getType() == Token.EOF && !includes.empty()){ // We've got EOF and have non empty stack. SaveStruct ss = includes.pop(); setCharStream(ss.input); input.rewind(ss.marker); ...... } ...... } // and lexer rule INCLUDE : '#include' (WS)? f=STRING { ...... try { // save current lexer's state SaveStruct ss = new SaveStruct(input); includes.push(ss); // switch on new input stream setCharStream(new ANTLRFileStream(name)); reset(); } ...... }; |
逻辑是:
也是,对于遇到了要include的文件,
类似于新的lexer
然后先去新建一个,全局的那个SaveStruct
将其保存起来,即push,即压栈
然后使用当前新的CharStream
然后用reset,使得回到文件最开始处,再重新处理
这样,就是:
先保存了旧的,父级的lexer(tokenStream)
然后用当前child级别的lexer去处理新的内容
处理完成后,即遇到了EOF
然后会在上面的nextToken中遇到
会去对于全局的变量includes,去pop,拿出来,之前保存的父级的lexer
然后通过setCharStream把后续要处理的内容拿出来
再通过input.rewind,定位到之前记录的位置,
就可以继续去处理了。
以此实现了递归的调用。
而基本明白了递归调用,递归处理父级和子级的lexer或tokenSteam,CharStream的逻辑后,
接下来,就可以,参考两者的不同之处,找到antlr v3中,如何去模拟此套逻辑了。
8.关于cppLexer.g中的多参数的#define实现宏替换的逻辑过程,参见:
【整理】分析cppLexer.g中的多参数的#define实现宏替换的逻辑过程
搞懂逻辑后,接下来,才是,如何将其转化为antlr v3版本的代码,实现同样的逻辑。
9.暂时写了如下代码:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | fragment //MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n') //MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n') //MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n') //MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*; //MACRO_TEXT : (('\\' '\r'? '\n') | (~('\n')))*; MACRO_TEXT : (( '\\' '\n' ) | (~( '\n' )))*; //MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*; DIRECTIVE @init{ List args = new ArrayList(); boolean condition = true ; String arg0Text = "" ; String arg1Text = "" ; String definedContent = "" ; String defineId = "" ; } : ( '#define' WS* defineMacro=RAW_IDENTIFIER { args.add( "" ); // first element will hold the macro text } ( ( '(' // get arguments if you find them (no spaces before left paren) (WS)? defineArg0=RAW_IDENTIFIER (WS)? {arg0Text = defineArg0.getText(); args.add(arg0Text);} ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {arg1Text = defineArg1.getText(); args.add(arg1Text);} )* ')' | ' ' | '\t' | '\f' ) ( options{greedy= true ;}: ' ' | '\t' | '\f' )* // store the text verbatim - tokenize when called defineText=MACRO_TEXT { definedContent = defineText.getText(); args.set(0, definedContent); } )? '\n' { defineId = defineMacro.getText(); defines.put(defineId, args ); skip(); } ); IDENTIFIER @init{ List define = new ArrayList(); List foundArgs = new ArrayList(); String callArg0Text = "" ; String callArg1Text = "" ; } : identifier=RAW_IDENTIFIER { // see if this is a macro argument define = (List)defineArgs.get(identifier.getText()); if (define== null ) { // see if this is a macro call define = (List)defines.get(identifier.getText()); } } ( { !((define!= null ) && (define.size()>1)) }? | { (define!= null ) && (define.size()>1) }? (WS|COMMENT)? // take in arguments if macro call requires them '(' callArg0=EXPR { callArg0Text = callArg0.getText(); foundArgs.add(callArg0Text); } ( COMMA callArg1=EXPR { callArg1Text = callArg1.getText(); foundArgs.add(callArg1Text); } )* { foundArgs.size()==define.size()-1 }? // better have right amount ')' ) { if (define!= null ) { String defineText = (String)define.get(0); if (define.size()==1) { //only have one value in list -> the defineText is the define para content -> just need replace directly setText(defineText); } else { //add new dict pair: (para, call value) for (int i=0;i<foundArgs.size();++i) { // treat macro arguments similar to local defines List arg = new ArrayList(); arg.add((String)foundArgs.get(i)); defineArgs.put( (String)define.get(1+i), arg ); } // save current lexer's state SaveStruct ss = new SaveStruct(input); includes.push(ss); // switch on new input stream setCharStream( new ANTLRStringStream(defineText)); reset(); } } }; |
但是还没成功,且遇到一个问题:
10.
转载请注明:在路上 » 【记录】将antlr v2的C/C++的preprocess,即cpp.g,转换为antlr v3