【Background】
I need to implement C-language preprocessing with ANTLR: #include, #define, and so on.
I referred to:
[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)
and have already got part of it working.
The code so far:
grammar preprocess;
//lexer grammar preprocess;

options{
    language=Java;
    output = AST;
}

@lexer::header {
    //package com.mm.antlrv3demo;
    import java.io.*;
    import java.util.*;
}

@parser::header {
    //package com.mm.antlrv3demo;
}

@lexer::members {
    //public static TokenStreamSelector selector; // must be assigned externally
    protected static Integer ifState = 1;              // -1: no-else false, 0: false, 1: true
    protected static List ifStates = new ArrayList();  // holds nested if conditions
    protected static Map defines = new Hashtable();    // holds the defines
    protected Map defineArgs = new Hashtable();        // holds the args for a macro call

    /*
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        } catch (NoSuchElementException e) {
            // return a real EOF if nothing in stack
        }
    }
    */

    class SaveStruct {
        SaveStruct(CharStream input){
            this.input = input;
            this.marker = input.mark();
        }
        public CharStream input;
        public int marker;
    }

    Stack<SaveStruct> includes = new Stack<SaveStruct>();

    // We should override this method for handling EOF of included file
    public Token nextToken(){
        Token token = super.nextToken();

        if(token.getType() == Token.EOF && !includes.empty()){
            // We've got EOF and have non empty stack.
            SaveStruct ss = includes.pop();
            setCharStream(ss.input);
            input.rewind(ss.marker);
            //this should be used instead of super [like below] to handle exits from nested includes
            //it matters, when the 'include' token is the last in previous stream (using super, lexer 'crashes' returning EOF token)
            token = this.nextToken();
        }

        // Skip first token after switching on another input.
        // You need to use this rather than super as there may be nested include files
        if(((CommonToken)token).getStartIndex() < 0)
            token = this.nextToken();

        return token;
    }
}

COMMENT
    : ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
    | ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
    ;

// and lexer rule
INCLUDE : '#include' (WS)? f=STRING
    {
        String name = f.getText();
        name = name.substring(1, name.length()-1);
        try {
            // save current lexer's state
            SaveStruct ss = new SaveStruct(input);
            includes.push(ss);

            // switch on new input stream
            setCharStream(new ANTLRFileStream(name));
            reset();
        } catch(Exception fnf) {
            throw new Error("Cannot open file " + name);
        }
    };

/*
fragment NON_CR_LF : ~('\r'|'\n');
fragment TAB_SPACE : (' ' | '\t');
*/

//DIRECTIVE : ('#define' WS* defineMacro=ID WS* defineText=STRING)
//DIRECTIVE : ('#define' WS* defineMacro=ID WS* defineText=( NON_CR_LF+ | (NON_CR_LF* (TAB_SPACE+ '\\' '\r'? '\n' NON_CR_LF+)*) ) )

fragment
//MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;

DIRECTIVE
@init{
    List args = new ArrayList();
    boolean condition = true;
}
    : ('#define' WS* defineMacro=RAW_IDENTIFIER
        (
            (
                '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
                ')'
                | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText=MACRO_TEXT {args.set(0,defineText.getText());}
        )?
        '\n'
        {
            defines.put( defineMacro.getText(), args );
            skip();
        }
    );

IDENTIFIER
@init{
    List define = new ArrayList();
    List args = new ArrayList();
}
    : identifier=RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
    }
    (
        { (define!=null) && (define.size()>1) }?
        (WS|COMMENT)?
        // take in arguments if macro call requires them
        '(' callArg0=EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1=EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
        | { !((define!=null) && (define.size()>1)) }?
    )
    {
        if (define!=null) {
            String defineText = (String)define.get(0);

            // create a new lexer to handle the macro text
            preprocessLexer sublexer = new preprocessLexer(new DataInputStream(new StringBufferInputStream(defineText)));
            for (int i=0;i<args.size();++i) {
                // treat macro arguments similar to local defines
                List arg = new ArrayList();
                arg.add((String)args.get(i));
                sublexer.defineArgs.put( (String)define.get(1+i), arg );
            }
            selector.push(sublexer);
            // retry in new lexer
            selector.retry();
        }
    };

fragment RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;

NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ; // allow alpha suffixes on numbers (i.e. L:long)

// group symbols into categories to parse EXPR
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;

fragment EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
      (
        ( LEFT EXPR ( COMMA EXPR )* RIGHT
        | STRING
        | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
        )
        EXPR
      )?
    ;

//INT : '0'..'9'+ ;

FLOAT
    : ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    | '.' ('0'..'9')+ EXPONENT?
    | ('0'..'9')+ EXPONENT
    ;

WS : ( ' ' | '\t' | '\r' | '\n' ) {$channel=HIDDEN;} ;

//RestSymbo : '{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;

STRING : '"' ( ESC_SEQ | ~('\\'|'"') )* '"' ;

CHAR : '\'' ( ESC_SEQ | ~('\''|'\\') ) '\'' ;

fragment EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;

fragment HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;

fragment ESC_SEQ
    : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    | UNICODE_ESC
    | OCTAL_ESC
    ;

fragment OCTAL_ESC
    : '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7')
    ;

fragment UNICODE_ESC : '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT ;

header : include* ;

include : INCLUDE ; //'#include ' '<' ID ('.h' | '.ddl') '>';
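For reference, a minimal driver to try the grammar out could look like the sketch below. It assumes the class names that ANTLR v3 generates from "grammar preprocess" (preprocessLexer / preprocessParser) and a hypothetical input file main.c:

import org.antlr.runtime.*;

public class TestPreprocess {
    public static void main(String[] args) throws Exception {
        // lex the top-level file; the overridden nextToken() transparently handles
        // returning from included files at EOF
        preprocessLexer lexer = new preprocessLexer(new ANTLRFileStream("main.c"));
        CommonTokenStream tokens = new CommonTokenStream(lexer);

        // parse with the simple header/include rules at the bottom of the grammar
        preprocessParser parser = new preprocessParser(tokens);
        parser.header();
    }
}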
But many problems remain. The main one: the old ANTLR v2 code relies on TokenStreamSelector, and it is unclear how to express that logic in ANTLR v3, i.e. which classes and methods should replace it.
【Process】
1. Regarding the preprocessing problem, this person:
[antlr-interest] C PreProcessor Errors
ran into something similar, but it is of no help here.
2. These threads:
[antlr-interest] ANTLR 3 migration: TokenStreamSelector
and:
[antlr-interest] TokenStreamSelector + ANTLRv3
also ask how to handle TokenStreamSelector when moving from v2 to v3, but nobody answered.
3. This post:
Tips on designing a preprocessor for C++ using Antlr
explains preprocessing quite thoroughly, but it is still written for ANTLR v2; I still cannot understand it completely, and still cannot find a replacement for TokenStreamSelector.
4. Googled:
antlr TokenStream Selector deprecated
and saw the term "Token Stream Multiplexing", so I went looking in the ANTLR author's book
The Definitive ANTLR Reference.pdf
for its explanation of this topic; it may contain useful pointers.
5. Also, a side note: the
testLiterals
mentioned in the code above is actually ANTLR v2 syntax.
According to someone's comment on:
Migrating from ANTLR 2 to ANTLR 3
testLiterals is gone in ANTLR v3 as well.
6. Reference:
[antlr-interest] v3 – How to deal with include Files?
also discusses a similar problem, but again with no solution.
7. Reading through the code myself, things are starting to become clear:
(1) ANTLR v2's logic for handing off to a new lexer (and token stream):
public static TokenStreamSelector selector; // must be assigned externally
protected static Map defines = new Hashtable(); // holds the defines

public void uponEOF() throws TokenStreamException, CharStreamException {
    try {
        selector.pop(); // return to old lexer/stream
        selector.retry();
    }
    ......
}

: '#'
    ( "include" (WS)? includeFile:STRING
        {
            ......
            try {
                cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
                sublexer.defines = defines; // want defines to be persistent
                sublexer.setFilename(name);
                selector.push(sublexer);
                selector.retry();
            }
            ......
        }
    }
    ......
    } else {
        // create a new lexer to handle the macro text
        cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText)));
        ......
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};
That is, the main steps are:
create a sublexer with new cppLexer,
initialize a few things, e.g. assign the shared global defines to it,
then hand control over to the new sublexer by calling:
push first,
then retry.
Correspondingly, every lexer has an uponEOF: its purpose is that when EOF is reached, control returns to the previous (parent) lexer, so it calls:
pop first (go back up to the parent lexer),
then retry (effectively a refresh, i.e. continue fetching tokens from the now-current parent lexer).
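For context, the usual ANTLR v2 driver for this "token stream multiplexing" setup is sketched below. This is only an assumption of how cpp.g is meant to be wired up (the start rule translation_unit is a placeholder); the point is that the parser reads tokens from the selector rather than from any single lexer, which is why push/pop/retry stay invisible to it:

import java.io.*;
import antlr.*;

public class CppDriver {
    public static void main(String[] args) throws Exception {
        // top-level lexer on the main source file
        cppLexer mainLexer = new cppLexer(new DataInputStream(new FileInputStream(args[0])));

        // the selector multiplexes between the main lexer and any sub-lexers it pushes
        TokenStreamSelector selector = new TokenStreamSelector();
        cppLexer.selector = selector;              // cpp.g expects this to be assigned externally
        selector.addInputStream(mainLexer, "main");
        selector.select("main");                   // start pulling tokens from the main lexer

        // the parser reads from the selector, so sub-lexer switches are transparent to it
        cppParser parser = new cppParser(selector);
        parser.translation_unit();                 // placeholder start rule
    }
}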
(2) Correspondingly, the ANTLR v3 code already implemented here for handing off to a new lexer (and token stream) is:
Stack<SaveStruct> includes = new Stack<SaveStruct>();

// We should override this method for handling EOF of included file
public Token nextToken(){
    ......
    if(token.getType() == Token.EOF && !includes.empty()){
        // We've got EOF and have non empty stack.
        SaveStruct ss = includes.pop();
        setCharStream(ss.input);
        input.rewind(ss.marker);
        ......
    }
    ......
}

// and lexer rule
INCLUDE : '#include' (WS)? f=STRING
    {
        ......
        try {
            // save current lexer's state
            SaveStruct ss = new SaveStruct(input);
            includes.push(ss);

            // switch on new input stream
            setCharStream(new ANTLRFileStream(name));
            reset();
        }
        ......
    };
The logic is: when a file to #include is encountered (which is in effect the point where a new lexer would be needed),
first create a SaveStruct recording the current state and save it, i.e. push it onto the stack;
then switch to the new CharStream for the included file;
then call reset(), so lexing restarts at the beginning of the new file.
In other words:
the old, parent lexer (token stream) is saved first,
then the same lexer processes the new, child-level content.
When that content is finished, i.e. EOF is reached,
the overridden nextToken() above notices it,
pops the previously saved parent state off the global includes stack,
restores the remaining input via setCharStream,
and uses input.rewind to jump back to the recorded position (see the small mark/rewind illustration below),
so processing of the parent simply continues from there.
This is how the recursive (nested) handling is achieved.
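As a side note, the mark()/rewind() pairing that SaveStruct relies on can be shown in isolation. This is just a standalone sketch against the ANTLR v3 runtime, not part of the grammar:

import org.antlr.runtime.*;

public class MarkRewindDemo {
    public static void main(String[] args) {
        CharStream cs = new ANTLRStringStream("#include \"a.h\"\nint x;\n");
        int marker = cs.mark();            // remember the current position
        cs.consume();                      // lexing moves the stream forward...
        cs.consume();
        System.out.println(cs.index());    // position has advanced
        cs.rewind(marker);                 // ...later, jump back to where mark() was taken
        System.out.println(cs.index());    // back at the remembered position
    }
}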
Now that this recursive handling of parent and child lexers / token streams / CharStreams is basically understood,
the next step is to compare the differences between the two approaches and work out how to emulate the v2 logic in ANTLR v3.
8. For the logic of how cppLexer.g implements macro substitution for a multi-parameter #define, see:
【整理】分析cppLexer.g中的多参数的#define实现宏替换的逻辑过程 (an analysis of the multi-parameter #define macro-substitution logic in cppLexer.g)
Only once that logic is understood can it be converted into ANTLR v3 code implementing the same behavior.
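To make that layout concrete, here is a small standalone Java sketch (independent of the grammar) of how the defines and defineArgs maps end up populated, assuming a hypothetical macro #define ADD(a,b) ((a)+(b)) and a call site ADD(1,2):

import java.util.*;

public class DefinesLayoutDemo {
    public static void main(String[] args) {
        // What the DIRECTIVE rule stores for: #define ADD(a,b) ((a)+(b))
        // index 0 holds the verbatim macro body, indexes 1..n hold the parameter names
        Map defines = new Hashtable();
        List addEntry = new ArrayList();
        addEntry.add("((a)+(b))");   // index 0: macro text
        addEntry.add("a");           // index 1..: parameter names
        addEntry.add("b");
        defines.put("ADD", addEntry);

        // What the IDENTIFIER rule builds for a call site ADD(1,2):
        // each parameter name maps to a one-element list holding the call argument text
        Map defineArgs = new Hashtable();
        List argA = new ArrayList();
        argA.add("1");
        defineArgs.put("a", argA);
        List argB = new ArrayList();
        argB.add("2");
        defineArgs.put("b", argB);

        System.out.println("defines = " + defines);
        System.out.println("defineArgs = " + defineArgs);
    }
}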
9. For now I have written the following code:
fragment
//MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
//MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT : (('\\' '\r'? '\n') | (~('\n')))*;
MACRO_TEXT : (('\\' '\n') | (~('\n')))*;
//MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;

DIRECTIVE
@init{
    List args = new ArrayList();
    boolean condition = true;
    String arg0Text = "";
    String arg1Text = "";
    String definedContent = "";
    String defineId = "";
}
    : ('#define' WS* defineMacro=RAW_IDENTIFIER
        {
            args.add(""); // first element will hold the macro text
        }
        (
            (
                '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0=RAW_IDENTIFIER (WS)? {arg0Text = defineArg0.getText(); args.add(arg0Text);}
                ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {arg1Text = defineArg1.getText(); args.add(arg1Text);} )*
                ')'
                | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText=MACRO_TEXT
            {
                definedContent = defineText.getText();
                args.set(0, definedContent);
            }
        )?
        '\n'
        {
            defineId = defineMacro.getText();
            defines.put(defineId, args);
            skip();
        }
    );

IDENTIFIER
@init{
    List define = new ArrayList();
    List foundArgs = new ArrayList();
    String callArg0Text = "";
    String callArg1Text = "";
}
    : identifier=RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    (
        { !((define!=null) && (define.size()>1)) }?
        | { (define!=null) && (define.size()>1) }?
          (WS|COMMENT)?
          // take in arguments if macro call requires them
          '(' callArg0=EXPR { callArg0Text = callArg0.getText(); foundArgs.add(callArg0Text); }
          ( COMMA callArg1=EXPR { callArg1Text = callArg1.getText(); foundArgs.add(callArg1Text); } )*
          { foundArgs.size()==define.size()-1 }? // better have right amount
          ')'
    )
    {
        if (define!=null) {
            String defineText = (String)define.get(0);

            if (define.size()==1) {
                // only one value in list -> the defineText is the define para content -> just need replace directly
                setText(defineText);
            } else {
                // add new dict pair: (para, call value)
                for (int i=0;i<foundArgs.size();++i) {
                    // treat macro arguments similar to local defines
                    List arg = new ArrayList();
                    arg.add((String)foundArgs.get(i));
                    defineArgs.put( (String)define.get(1+i), arg );
                }

                // save current lexer's state
                SaveStruct ss = new SaveStruct(input);
                includes.push(ss);

                // switch on new input stream
                setCharStream(new ANTLRStringStream(defineText));
                reset();
            }
        }
    };
But it does not work yet, and I have run into a problem:
10.