【背景】
需要用antlr实现C语言的预处理:
include,define等等内容。
参考了:
[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)
已经实现了部分的事情。
代码如下:
// ANTLR v3 grammar implementing a C-preprocessor front end (#include, #define,
// macro expansion). Ported from the classic ANTLR v2 cpp.g: the v2
// TokenStreamSelector push/retry mechanism is replaced by a Stack<SaveStruct>
// of saved CharStreams plus setCharStream()/reset(), with EOF handled in an
// overridden nextToken().
grammar preprocess;

options {
    language = Java;
    output = AST;
}

@lexer::header {
//package com.mm.antlrv3demo;
import java.io.*;
import java.util.*;
}

@parser::header {
//package com.mm.antlrv3demo;
}

@lexer::members {
protected static Integer ifState = 1;            // -1: no-else false, 0: false, 1: true
protected static List ifStates = new ArrayList(); // holds nested if conditions
protected static Map defines = new Hashtable();   // macro name -> List(bodyText, argName...)
protected Map defineArgs = new Hashtable();       // holds the args for a macro call

// One saved lexer state per nesting level (#include or macro expansion):
// the CharStream being read and a mark so we can rewind to the exact
// position after the directive once the nested stream is exhausted.
class SaveStruct {
    SaveStruct(CharStream input) {
        this.input = input;
        this.marker = input.mark();
    }
    public CharStream input;
    public int marker;
}

Stack<SaveStruct> includes = new Stack<SaveStruct>();

// Override nextToken to handle EOF of an included/expanded stream:
// pop the saved parent stream and resume it at the recorded mark.
public Token nextToken() {
    Token token = super.nextToken();
    if (token.getType() == Token.EOF && !includes.empty()) {
        // We've got EOF and have a non-empty stack: return to the parent stream.
        SaveStruct ss = includes.pop();
        setCharStream(ss.input);
        input.rewind(ss.marker);
        // Use this.nextToken() rather than super: it matters when the 'include'
        // token is the last in the previous stream (with super, the lexer
        // 'crashes' returning an EOF token) and for exits from nested includes.
        token = this.nextToken();
    }
    // Skip the first token after switching to another input.
    // Use this rather than super as there may be nested include files.
    if (((CommonToken) token).getStartIndex() < 0)
        token = this.nextToken();
    return token;
}
}

COMMENT
    : ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
    | ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
    ;

// #include "file": save the current stream state, switch to the named file,
// and reset() so lexing restarts at its beginning.
INCLUDE
    : '#include' (WS)? f=STRING
      {
      String name = f.getText();
      name = name.substring(1, name.length()-1); // strip surrounding quotes
      try {
          // save current lexer's state
          SaveStruct ss = new SaveStruct(input);
          includes.push(ss);
          // switch on new input stream
          setCharStream(new ANTLRFileStream(name));
          reset();
      } catch (Exception fnf) {
          throw new Error("Cannot open file " + name);
      }
      };

// Macro body: everything up to an unescaped newline; a backslash-newline
// pair continues the body onto the next line.
fragment
MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*;

DIRECTIVE
@init{
    List args = new ArrayList();
    boolean condition = true;
}
    : ('#define' WS* defineMacro=RAW_IDENTIFIER
       // BUG FIX: reserve slot 0 for the macro body text up front; the
       // original called args.set(0, ...) on an empty list, which throws
       // IndexOutOfBoundsException for every #define with a body.
       { args.add(""); }
       (
         ( '(' // get arguments if you find them (no spaces before left paren)
           (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
           ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
           ')'
         | ' '|'\t'|'\f'
         )
         ( options{greedy=true;}: ' '|'\t'|'\f' )*
         // store the text verbatim - tokenize when called
         defineText=MACRO_TEXT {args.set(0, defineText.getText());}
       )?
       '\n'
       {
       defines.put(defineMacro.getText(), args);
       skip();
       }
      );

// An identifier: either plain, or a macro reference to be expanded.
// Expansion is done the v3 way — push the current stream onto 'includes'
// and lex the macro body from an ANTLRStringStream — instead of the v2
// selector.push(sublexer)/selector.retry(), which has no v3 equivalent.
IDENTIFIER
@init{
    List define = new ArrayList();
    List args = new ArrayList();
}
    : identifier=RAW_IDENTIFIER
      {
      // see if this is a macro argument
      define = (List) defineArgs.get(identifier.getText());
      if (define == null) {
          // BUG FIX: also consult the global defines; the original only
          // checked defineArgs, so ordinary macro calls were never expanded.
          define = (List) defines.get(identifier.getText());
      }
      }
      ( { (define != null) && (define.size() > 1) }?
        (WS|COMMENT)?
        // take in arguments if the macro call requires them
        '(' callArg0=EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1=EXPR {args.add(callArg1.getText());} )*
        { args.size() == define.size()-1 }? // better have right amount
        ')'
      | { !((define != null) && (define.size() > 1)) }?
      )
      {
      if (define != null) {
          String defineText = (String) define.get(0);
          if (define.size() == 1) {
              // parameterless macro: substitute the body text directly
              setText(defineText);
          } else {
              // bind each formal parameter to its call argument,
              // treating macro arguments like local defines
              for (int i = 0; i < args.size(); ++i) {
                  List arg = new ArrayList();
                  arg.add((String) args.get(i));
                  defineArgs.put((String) define.get(1 + i), arg);
              }
              // save current lexer's state and lex the macro body
              SaveStruct ss = new SaveStruct(input);
              includes.push(ss);
              setCharStream(new ANTLRStringStream(defineText));
              reset();
          }
      }
      };

fragment
RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;

NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ; // allow alpha suffixes on numbers (i.e. L:long)

// group symbols into categories to parse EXPR
LEFT     : '(' | '[' | '{' ;
RIGHT    : ')' | ']' | '}' ;
COMMA    : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.'
         | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@'
         | '\\' | '^' | '`' | '|' | '~' ;

fragment
EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
      ( ( LEFT EXPR ( COMMA EXPR )* RIGHT
        | STRING
        | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
        )
        EXPR
      )?
    ;

FLOAT
    : ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    | '.' ('0'..'9')+ EXPONENT?
    | ('0'..'9')+ EXPONENT
    ;

WS : ( ' ' | '\t' | '\r' | '\n' ) {$channel=HIDDEN;} ;

STRING : '"' ( ESC_SEQ | ~('\\'|'"') )* '"' ;

CHAR : '\'' ( ESC_SEQ | ~('\''|'\\') ) '\'' ;

fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;

fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;

fragment
ESC_SEQ
    : '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    | UNICODE_ESC
    | OCTAL_ESC
    ;

fragment
OCTAL_ESC
    : '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7') ('0'..'7')
    | '\\' ('0'..'7')
    ;

fragment
UNICODE_ESC : '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT ;

// parser rules
header  : include*;
include : INCLUDE;
但是还是遇到很多问题,其中主要就是,针对于旧的antlr v2的TokenStreamSelector,如何换用成antlr v3的逻辑,用哪些函数和类替代。
【折腾过程】
1.关于预处理的问题,这人:
[antlr-interest] C PreProcessor Errors
也遇到类似的事情,但是对此处没啥帮助。
2.这里:
[antlr-interest] ANTLR 3 migration: TokenStreamSelector
和:
[antlr-interest] TokenStreamSelector + ANTLRv3
也提到了,v2转v3时,如何处理TokenStreamSelector,但是没人回答。
3.这里:
Tips on designing a preprocessor for C++ using Antlr
关于预处理,已经解释得很全了,但仍是antlr v2的版本,还是不能完全透彻地理解,还是无法找到TokenStreamSelector的替代品。
4.google搜:
antlr TokenStream Selector deprecated
看到了“Token Stream Multiplexing”,所以,去找找antlr作者写的书
The Definitive ANTLR Reference.pdf
看看其中关于此部分的解释,或许可以找到有价值的参考资料。
5.另外,顺便提示一句,上述代码中的那个:
testLiterals
实际上是antlr v2的语法
根据:
Migrating from ANTLR 2 to ANTLR 3
的某人评论,得知此testLiterals,antlr v3中也没了。
6.参考:
[antlr-interest] v3 – How to deal with include Files?
也讨论了类似问题,但是还是无解。
7.自己看代码,有一点点眉目了:
(1)antlr v2中的处理新的lexer(和tokenStream)的逻辑
// NOTE(review): ANTLR v2 excerpt (elisions marked "......") — illustrative only, not compilable.
// Shows the old TokenStreamSelector mechanism this article is porting away from:
// #include and macro expansion each build a sublexer and enter it via
// selector.push(sublexer) + selector.retry(); on end of input, uponEOF()
// does selector.pop() + selector.retry() to resume the parent lexer.
// StringBufferInputStream is deprecated (byte/char mismatch) — another reason the port is needed.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | public static TokenStreamSelector selector; // must be assigned externally protected static Map defines = new Hashtable(); // holds the defines public void uponEOF() throws TokenStreamException, CharStreamException { try { selector.pop(); // return to old lexer/stream selector.retry(); } ...... } : '#' ( "include" (WS)? includeFile:STRING { ...... try { cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name))); sublexer.defines = defines; // want defines to be persistent sublexer.setFilename(name); selector.push(sublexer); selector.retry(); } ...... } } ...... } else { // create a new lexer to handle the macro text cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText))); ...... selector.push(sublexer); // retry in new lexer selector.retry(); } }}; |
即,主要是:
用new cppLexer新建一个sublexer,
然后初始化一堆东西,比如:
给对应的给全局变量defines去赋值等等
然后就转到新的sublexer去处理了,调用方法是:
先push
再retry
而后,对于新的lexer,都有对应的uponEOF,
其中目的是遇到了EOF,要返回之前的(父级的)lexer,所以
先去pop(返回到上一级,父级的lexer)
再去retry(相当于刷新,去使用当前的,父级的lexer)
(2)而与此相对应的,目前已经实现了,antlr v3的,处理新的lexer(和tokenStream)的代码是:
// NOTE(review): ANTLR v3 excerpt (elisions marked "......") — illustrative only, not compilable.
// The v3 replacement for TokenStreamSelector: the INCLUDE rule saves the current
// CharStream and a mark() in a SaveStruct, pushes it on the 'includes' stack, then
// setCharStream(new file) + reset(); the overridden nextToken() detects EOF of the
// included stream, pops the SaveStruct, restores the parent stream with
// setCharStream(), and rewind()s to the saved mark — giving recursive include handling.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | Stack<SaveStruct> includes = new Stack<SaveStruct>(); // We should override this method for handling EOF of included file public Token nextToken(){ ...... if(token.getType() == Token.EOF && !includes.empty()){ // We've got EOF and have non empty stack. SaveStruct ss = includes.pop(); setCharStream(ss.input); input.rewind(ss.marker); ...... } ...... } // and lexer rule INCLUDE : '#include' (WS)? f=STRING { ...... try { // save current lexer's state SaveStruct ss = new SaveStruct(input); includes.push(ss); // switch on new input stream setCharStream(new ANTLRFileStream(name)); reset(); } ...... }; |
逻辑是:
也是,对于遇到了要include的文件,
类似于新的lexer
然后先去新建一个,全局的那个SaveStruct
将其保存起来,即push,即压栈
然后使用当前新的CharStream
然后用reset,使得回到文件最开始处,再重新处理
这样,就是:
先保存了旧的,父级的lexer(tokenStream)
然后用当前child级别的lexer去处理新的内容
处理完成后,即遇到了EOF
然后会在上面的nextToken中遇到
会去对于全局的变量includes,去pop,拿出来,之前保存的父级的lexer
然后通过setCharStream把后续要处理的内容拿出来
再通过input.rewind,定位到之前记录的位置,
就可以继续去处理了。
以此实现了递归的调用。
而基本明白了递归调用,递归处理父级和子级的lexer或tokenSteam,CharStream的逻辑后,
接下来,就可以,参考两者的不同之处,找到antlr v3中,如何去模拟此套逻辑了。
8.关于cppLexer.g中的多参数的#define实现宏替换的逻辑过程,参见:
【整理】分析cppLexer.g中的多参数的#define实现宏替换的逻辑过程
搞懂逻辑后,接下来,才是,如何将其转化为antlr v3版本的代码,实现同样的逻辑。
9.暂时写了如下代码:
// NOTE(review): the author's work-in-progress v3 rewrite of DIRECTIVE/IDENTIFIER
// (explicitly stated below as not yet working). Kept verbatim for the record.
// Key differences from the earlier attempt: args.add("") reserves slot 0 before
// args.set(0, ...) is called; IDENTIFIER falls back from defineArgs to the global
// defines map; macro expansion uses SaveStruct/includes + ANTLRStringStream + reset()
// instead of the v2 selector.push/retry; parameterless macros are substituted via setText().
// NOTE(review): this MACRO_TEXT variant matches only '\\' '\n', dropping the '\r'?
// handled by the earlier version — presumably breaks \-continued lines with CRLF
// endings; TODO confirm against Windows-format input.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | fragment //MACRO_TEXT : ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n') //MACRO_TEXT : ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n') //MACRO_TEXT : ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n') //MACRO_TEXT : (('\\' '\r'? '\n') | (~('\r'|'\n')))*; //MACRO_TEXT : (('\\' '\r'? '\n') | (~('\n')))*; MACRO_TEXT : (( '\\' '\n' ) | (~( '\n' )))*; //MACRO_TEXT : ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*; DIRECTIVE @init{ List args = new ArrayList(); boolean condition = true ; String arg0Text = "" ; String arg1Text = "" ; String definedContent = "" ; String defineId = "" ; } : ( '#define' WS* defineMacro=RAW_IDENTIFIER { args.add( "" ); // first element will hold the macro text } ( ( '(' // get arguments if you find them (no spaces before left paren) (WS)? defineArg0=RAW_IDENTIFIER (WS)? {arg0Text = defineArg0.getText(); args.add(arg0Text);} ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {arg1Text = defineArg1.getText(); args.add(arg1Text);} )* ')' | ' ' | '\t' | '\f' ) ( options{greedy= true ;}: ' ' | '\t' | '\f' )* // store the text verbatim - tokenize when called defineText=MACRO_TEXT { definedContent = defineText.getText(); args.set(0, definedContent); } )? 
'\n' { defineId = defineMacro.getText(); defines.put(defineId, args ); skip(); } ); IDENTIFIER @init{ List define = new ArrayList(); List foundArgs = new ArrayList(); String callArg0Text = "" ; String callArg1Text = "" ; } : identifier=RAW_IDENTIFIER { // see if this is a macro argument define = (List)defineArgs.get(identifier.getText()); if (define== null ) { // see if this is a macro call define = (List)defines.get(identifier.getText()); } } ( { !((define!= null ) && (define.size()>1)) }? | { (define!= null ) && (define.size()>1) }? (WS|COMMENT)? // take in arguments if macro call requires them '(' callArg0=EXPR { callArg0Text = callArg0.getText(); foundArgs.add(callArg0Text); } ( COMMA callArg1=EXPR { callArg1Text = callArg1.getText(); foundArgs.add(callArg1Text); } )* { foundArgs.size()==define.size()-1 }? // better have right amount ')' ) { if (define!= null ) { String defineText = (String)define.get(0); if (define.size()==1) { //only have one value in list -> the defineText is the define para content -> just need replace directly setText(defineText); } else { //add new dict pair: (para, call value) for (int i=0;i<foundArgs.size();++i) { // treat macro arguments similar to local defines List arg = new ArrayList(); arg.add((String)foundArgs.get(i)); defineArgs.put( (String)define.get(1+i), arg ); } // save current lexer's state SaveStruct ss = new SaveStruct(input); includes.push(ss); // switch on new input stream setCharStream( new ANTLRStringStream(defineText)); reset(); } } }; |
但是还没成功,且遇到一个问题:
10.
转载请注明:在路上 » 【记录】将antlr v2的C/C++的preprocess,即cpp.g,转换为antlr v3