最新消息:20210816 当前crifan.com域名已被污染,为防止失联,请关注(页面右下角的)公众号

【记录】将antlr v2的C/C++的preprocess,即cpp.g,转换为antlr v3

ANTLR crifan 2820浏览 0评论

【背景】

需要用antlr实现C语言的预处理:

include,define等等内容。

参考了:

[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)

已经实现了部分的事情。

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
grammar preprocess;
//lexer grammar preprocess;
 
options{
    language=Java;
    output = AST;
}
 
@lexer::header {
//package com.mm.antlrv3demo;
 
import java.io.*;
import java.util.*;
}
 
@parser::header {
//package com.mm.antlrv3demo;
}
 
@lexer::members {
    // The ANTLR v2 grammar relied on an externally assigned TokenStreamSelector and
    // an uponEOF() hook. This v3 port instead keeps a stack of suspended character
    // streams and resumes them from the nextToken() override below.
    protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true
    protected static List ifStates = new ArrayList(); // holds nested if conditions
    protected static Map defines = new Hashtable(); // holds the defines
    protected Map defineArgs = new Hashtable(); // holds the args for a macro call

    /** Remembers a suspended character stream together with the marker to rewind to. */
    class SaveStruct {
        public CharStream input;
        public int marker;

        SaveStruct(CharStream input){
            this.input = input;
            this.marker = input.mark();
        }
    }

    /** Character streams that were interrupted by a nested input; innermost on top. */
    Stack<SaveStruct> includes = new Stack<SaveStruct>();

    /**
     * Returns the next token, transparently resuming the suspended stream when a
     * nested input (e.g. an included file) reaches EOF.
     */
    public Token nextToken(){
        Token current = super.nextToken();

        boolean nestedInputFinished = current.getType() == Token.EOF && !includes.empty();
        if (nestedInputFinished) {
            // Restore the stream that was active before the nested input started.
            SaveStruct suspended = includes.pop();
            setCharStream(suspended.input);
            input.rewind(suspended.marker);
            // Recurse through this override (not super) so that leaving several
            // nested inputs in a row keeps unwinding instead of returning EOF when
            // the switch was the last token of the previous stream.
            current = this.nextToken();
        }

        // A negative start index marks the token produced while switching to
        // another input; drop it. Again recurse through this override because the
        // inputs may be nested.
        CommonToken asCommon = (CommonToken) current;
        if (asCommon.getStartIndex() < 0) {
            current = this.nextToken();
        }

        return current;
    }
}
 
// Matches a '//' comment through its terminating newline, or a '/* ... */' block
// comment (non-greedy, so it stops at the first '*/'). Both are discarded with skip().
COMMENT
    :   ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
    |   ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
    ;
 
// and lexer rule
// Handles `#include "file"`: suspends the current character stream and continues
// lexing from the named file. The nextToken() override in the lexer members pops
// the suspended stream again once the included file reaches EOF.
// WS matches a single whitespace character, so WS* is used to accept any amount of
// whitespace between the keyword and the file name.
INCLUDE    :    '#include' WS* f=STRING
{
    // strip the surrounding double quotes from the STRING token
    String name = f.getText();
    name = name.substring(1,name.length()-1);
    try {
        // open the file first, so a failure leaves the include stack untouched
        CharStream included = new ANTLRFileStream(name);

        // save current lexer's state
        includes.push(new SaveStruct(input));

        // switch on new input stream
        setCharStream(included);
        reset();

    } catch(IOException fnf) {
        // keep the original failure as the cause so the reason is not lost
        throw new Error("Cannot open file " + name, fnf);
    }
};
/*
fragment
NON_CR_LF   :   ~('\r'|'\n');
 
fragment
TAB_SPACE
    :   (' ' | '\t');
*/
 
//DIRECTIVE     :   ('#define' WS* defineMacro=ID WS* defineText=STRING)
//DIRECTIVE     :   ('#define' WS* defineMacro=ID WS* defineText=( NON_CR_LF+ | (NON_CR_LF* (TAB_SPACE+ '\\' '\r'? '\n' NON_CR_LF+)*) ) )
 
 
// Replacement text of a #define: any run of characters other than CR/LF, where a
// backslash followed by an (optional CR and) LF is also consumed so a definition
// can continue on the next line. No action strips the backslash or the line break,
// so they remain part of the matched text; the commented-out variants below are
// earlier attempts at removing them.
fragment
//MACRO_TEXT :    ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT :    ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT :    ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
MACRO_TEXT :    (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT :    ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;
 
// Handles `#define NAME text` and `#define NAME(a, b) text`.
// The macro is stored in `defines` under its name as a list: element 0 is the
// replacement text (empty if none was given) and elements 1..n are the parameter
// names. The directive itself produces no token (skip()).
DIRECTIVE @init{
    List args = new ArrayList();
}   :   ('#define' WS* defineMacro=RAW_IDENTIFIER
    {
        // Reserve slot 0 for the macro text. Without it, args.set(0, ...) below
        // throws on an empty list or overwrites the first parameter name.
        args.add("");
    }
        (
            ( '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText=MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\r'? '\n' // MACRO_TEXT stops before CR, so accept CRLF line endings too
    {
        defines.put( defineMacro.getText(), args );
        skip();
    }
    );
 
// Matches an identifier and, if it names a macro argument, expands it.
// `define` is the list stored for the name: element 0 is the replacement text and
// any further elements are parameter names, so size()>1 means "takes arguments".
// NOTE(review): only defineArgs is consulted here; the `defines` map filled by
// DIRECTIVE is never looked up in this rule.
IDENTIFIER @init{
    List define = new ArrayList();
    List args = new ArrayList();
} :
    identifier=RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
    }
    ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0=EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1=EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
    | { !((define!=null) && (define.size()>1)) }?
    )
{
if (define!=null) {
    String defineText = (String)define.get(0);
 
    // create a new lexer to handle the macro text
    // NOTE(review): this is still the ANTLR v2 approach. Generated v3 lexers are
    // presumably constructed from a CharStream rather than an InputStream - confirm.
    preprocessLexer sublexer = new preprocessLexer(new DataInputStream(new StringBufferInputStream(defineText)));
    for (int i=0;i<args.size();++i) {
        // treat macro arguments similar to local defines
        List arg = new ArrayList();
        arg.add((String)args.get(i));
        sublexer.defineArgs.put( (String)define.get(1+i), arg );
    }
    // NOTE(review): `selector` is not declared in this grammar (its declaration in
    // the lexer members is commented out), so these calls have no v3 counterpart yet.
    selector.push(sublexer);
    // retry in new lexer
    selector.retry();
 
}
};
 
// A plain C-style name: a letter or underscore followed by letters, digits or
// underscores. It is a fragment, so it is only used from other lexer rules.
fragment RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;
 
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ; // allow alpha suffixes on numbers (i.e. L:long)
 
// group symbols into categories to parse EXPR
// LEFT/RIGHT are the opening/closing brackets, COMMA separates arguments, and
// OPERATOR covers the remaining punctuation characters listed below.
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;
 
 
// One macro-call argument. It is matched recursively: an optional leading
// whitespace character and NUMBER/IDENTIFIER, then optionally a bracketed group
// (whose inner EXPRs may be comma-separated), a STRING or an OPERATOR, followed by
// another EXPR. A COMMA outside brackets is not consumed, which is what lets the
// caller split the argument list on COMMA.
fragment EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
        (
            ( LEFT EXPR ( COMMA EXPR )* RIGHT
            | STRING
            | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR
        )?
    ;
 
//INT : '0'..'9'+    ;
 
// Floating-point literal in one of three forms: digits '.' optional digits with an
// optional exponent; '.' digits with an optional exponent; or digits followed by a
// mandatory exponent.
FLOAT
    :   ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    |   '.' ('0'..'9')+ EXPONENT?
    |   ('0'..'9')+ EXPONENT
    ;
 
// A single whitespace character (space, tab, CR or LF). As a token it is routed to
// the hidden channel rather than discarded.
WS  :   ( ' '
        | '\t'
        | '\r'
        | '\n'
        ) {$channel=HIDDEN;}
    ;
 
//RestSymbo :   '{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;
 
// Double-quoted string literal: escape sequences or any character other than a
// backslash or a double quote.
STRING
    :  '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
    ;
 
// Single-quoted character literal holding exactly one escape sequence or one
// character other than a single quote or backslash.
CHAR:  '\'' ( ESC_SEQ | ~('\''|'\\') ) '\''
    ;
 
// Exponent part of a FLOAT: 'e' or 'E', an optional sign, then one or more digits.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
 
// One hexadecimal digit (either case).
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
 
// Backslash escape: one of the single-character escapes listed, a unicode escape,
// or an octal escape.
fragment
ESC_SEQ
    :   '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    |   UNICODE_ESC
    |   OCTAL_ESC
    ;
 
// Octal escape of one to three digits; the three-digit form requires the first
// digit to be in 0..3.
fragment
OCTAL_ESC
    :   '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7')
    ;
 
// Unicode escape: backslash, 'u', then exactly four hexadecimal digits.
fragment
UNICODE_ESC
    :   '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;
     
// Parser entry rule: zero or more include directives.
header
    :   include*;
// A single INCLUDE token; the trailing comment is an earlier token-level sketch of
// the same rule.
include :   INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>';

 

但是还是遇到很多问题,其中主要就是,针对于旧的antlr v2的TokenStreamSelector,如何换用成antlr v3的逻辑,用哪些函数和类替代。

【折腾过程】

1.关于预处理的问题,这人:

[antlr-interest] C PreProcessor Errors

也遇到类似的事情,但是对此处没啥帮助。

2.这里:

[antlr-interest] ANTLR 3 migration: TokenStreamSelector

和:

[antlr-interest] TokenStreamSelector + ANTLRv3

也提到了,v2转v3时,如何处理TokenStreamSelector,但是没人回答。

 

3.这里:

Tips on designing a preprocessor for C++ using Antlr

关于预处理,已经解释的很全了,但是还是antlr v2的版本,还是不能完全透彻的理解,还是无法找到TokenStreamSelector的替代品。

4.google搜:

antlr TokenStream Selector deprecated

看到了“Token Stream Multiplexing”,所以,去找找antlr作者写的书

The Definitive ANTLR Reference.pdf

看看其中关于此部分的解释,或许可以找到有价值的参考资料。

5.另外,顺便提示一句,上述代码中的那个:

testLiterals

实际上是antlr v2的语法

根据:

Migrating from ANTLR 2 to ANTLR 3

的某人评论,得知此testLiterals,antlr v3中也没了。

6.参考:

[antlr-interest] v3 – How to deal with include Files?

也讨论了类似问题,但是还是无解。

7.自己看代码,有一点点眉目了:

(1)antlr v2中的处理新的lexer(和tokenStream)的逻辑

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
    public static TokenStreamSelector selector; // must be assigned externally
    protected static Map defines = new Hashtable(); // holds the defines
     
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        }
        ......
    }
     
 
: '#'
    ( "include" (WS)? includeFile:STRING {
            ......
            try {
                cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
                sublexer.defines = defines; // want defines to be persistent
                sublexer.setFilename(name);
                selector.push(sublexer);
                selector.retry();
            }
            ......
        }
    }
......
 
    } else {
        // create a new lexer to handle the macro text
        cppLexer sublexer = new cppLexer(new DataInputStream(new StringBufferInputStream(defineText)));
         
        ......
         
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};

即,主要是:

用new cppLexer新建一个sublexer,

然后初始化一堆东西,比如:

给对应的全局变量defines赋值等等

然后就转到新的sublexer去处理了,调用方法是:

先push

再retry

 

而后,对于新的lexer,都有对应的uponEOF,

其中目的是遇到了EOF,要返回之前的(父级的)lexer,所以

先去pop(返回到上一级,父级的lexer)

再去retry(相当于刷新,去使用当前的,父级的lexer)

 

(2)而与此相对应的,目前已经实现了,antlr v3的,处理新的lexer(和tokenStream)的代码是:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
     Stack<SaveStruct> includes = new Stack<SaveStruct>();
  
    // We should override this method for handling EOF of included file
     public Token nextToken(){
       ......
  
       if(token.getType() == Token.EOF && !includes.empty()){
        // We've got EOF and have non empty stack.
         SaveStruct ss = includes.pop();
         setCharStream(ss.input);
         input.rewind(ss.marker);
          
         ......
       }
  
      ......
     }
 
// and lexer rule
INCLUDE    :    '#include' (WS)? f=STRING
{
    ......
    try {
        // save current lexer's state
        SaveStruct ss = new SaveStruct(input);
        includes.push(ss);
  
        // switch on new input stream
        setCharStream(new ANTLRFileStream(name));
        reset();
 
    }
    ......
};

逻辑是:

也是,对于遇到了要include的文件,

类似于新的lexer

然后先去新建一个,全局的那个SaveStruct

将其保存起来,即push,即压栈

然后使用当前新的CharStream

然后用reset,使得回到文件最开始处,再重新处理

这样,就是:

先保存了旧的,父级的lexer(tokenStream)

然后用当前child级别的lexer去处理新的内容

 

处理完成后,即遇到了EOF

然后会在上面的nextToken中遇到

会去对于全局的变量includes,去pop,拿出来,之前保存的父级的lexer

然后通过setCharStream把后续要处理的内容拿出来

再通过input.rewind,定位到之前记录的位置,

就可以继续去处理了。

 

以此实现了递归的调用。

而基本明白了递归调用,递归处理父级和子级的lexer或tokenStream,CharStream的逻辑后,

接下来,就可以,参考两者的不同之处,找到antlr v3中,如何去模拟此套逻辑了。

8.关于cppLexer.g中的多参数的#define实现宏替换的逻辑过程,参见:

【整理】分析cppLexer.g中的多参数的#define实现宏替换的逻辑过程

搞懂逻辑后,接下来,才是,如何将其转化为antlr v3版本的代码,实现同样的逻辑。

9.暂时写了如下代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
// Replacement text of a #define, revised: everything up to (not including) the
// next LF, where a backslash immediately followed by LF is also consumed so the
// definition can continue on the next line. Unlike the earlier variants kept in
// the comments, CR is no longer excluded, so a CR is matched as ordinary text.
fragment
//MACRO_TEXT :    ( (('\\'){skip();System.out.println("skip line tail back slash");} '\r'? '\n')
//MACRO_TEXT :    ( ('\\'{$channel=HIDDEN;System.out.println("set back slash to hidden");} '\r'? '\n')
//MACRO_TEXT :    ( (('\\'){setText("");System.out.println("set back slash to empty");} '\r'? '\n')
//MACRO_TEXT :    (('\\' '\r'? '\n') | (~('\r'|'\n')))*;
//MACRO_TEXT :    (('\\' '\r'? '\n') | (~('\n')))*;
MACRO_TEXT :    (('\\' '\n') | (~('\n')))*;
//MACRO_TEXT :    ( ('\\' '\r'? '\n') | (~('\r'|'\n')))* -> ( ('\r'? '\n') | (~('\r'|'\n')))*;
 
 
// #define directive. The macro is recorded in `defines` under its name as a list
// whose element 0 is the replacement text and whose remaining elements are the
// parameter names; the directive itself yields no token.
DIRECTIVE @init{
    boolean condition = true;
    List args = new ArrayList();
}   :   ('#define' WS* defineMacro=RAW_IDENTIFIER
    {
        // slot 0 is a placeholder for the macro text, filled in below if present
        args.add("");
    }
        (
            ( '(' // a parameter list must directly follow the name (no spaces before left paren)
                (WS)? defineArg0=RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( ',' (WS)? defineArg1=RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // keep the replacement text verbatim; it is tokenized when the macro is used
            defineText=MACRO_TEXT {args.set(0, defineText.getText());}
        )? '\n'
    {
        defines.put(defineMacro.getText(), args);
        skip();
    }
    );
 
// Matches an identifier and expands it when it names a macro argument or a macro.
// `define` is the list stored for the name: element 0 is the replacement text and
// any further elements are parameter names, so size()>1 means "takes arguments".
// A macro without parameters just has its token text replaced; a macro with
// parameters binds the call arguments in defineArgs and switches the lexer to a
// string stream over the replacement text, saving the current stream on `includes`.
// NOTE(review): both alternatives are selected by predicates over the local
// `define`; the article reports this conditional matching as still unresolved.
IDENTIFIER @init{
    List define = new ArrayList();
    List foundArgs = new ArrayList();
     
    String callArg0Text = "";
    String callArg1Text = "";
} :
    identifier=RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    ( { !((define!=null) && (define.size()>1)) }?
    |
    { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0=EXPR
        {
            callArg0Text = callArg0.getText();
            foundArgs.add(callArg0Text);
        }
        ( COMMA callArg1=EXPR
        {
            callArg1Text = callArg1.getText();
            foundArgs.add(callArg1Text);
        }
        )*
        { foundArgs.size()==define.size()-1 }? // better have right amount
        ')'
    )
{
if (define!=null) {
    String defineText = (String)define.get(0);
     
    if (define.size()==1) {
        //only have one value in list -> the defineText is the define para content -> just need replace directly
        setText(defineText);
    } else {
        //add new dict pair: (para, call value)
        // NOTE(review): these bindings are put into this lexer's defineArgs and are
        // not removed anywhere in this rule - confirm whether that is intended.
        for (int i=0;i<foundArgs.size();++i) {
            // treat macro arguments similar to local defines
            List arg = new ArrayList();
            arg.add((String)foundArgs.get(i));
            defineArgs.put( (String)define.get(1+i), arg );
        }
         
        // save current lexer's state
        SaveStruct ss = new SaveStruct(input);
        includes.push(ss);
 
        // switch on new input stream
        setCharStream(new ANTLRStringStream(defineText));
        reset();
    }
}
};

但是还没成功,且遇到一个问题:

【未解决】antlr v3的lexer的条件性匹配

 

10.

转载请注明:在路上 » 【记录】将antlr v2的C/C++的preprocess,即cpp.g,转换为antlr v3

发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
89 queries in 0.476 seconds, using 22.24MB memory