最新消息:20210816 当前crifan.com域名已被污染,为防止失联,请关注(页面右下角的)公众号

【记录】尝试用antlr处理C代码中的#include

ANTLR crifan 3232浏览 0评论

1.真正去实现的时候,可以参考:

Tips on designing a preprocessor for C++ using Antlr

中的例子,去添加对应的action code。

2.参考:

How to do preprocessing in antlr v4?

->

A list of all available downloads at Soft Gems

->

Windows Resource File Parser + Converter

下载到 312KB的rc-converter.zip。

 

3.又从:

[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)

拷贝了一份代码,供后续参考:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
//copy from
//name to:
//cppLexer.g
 
// Author: Eric Mahurin
// License: just give me credit
 
// Grammar-file options placed ahead of the lexer class: the target language is set to Java.
options {
    language="Java";
}
 
{

import java.io.*;
import java.util.*;
import antlr.*;

/**
 * Command-line driver: lexes standard input through cppLexer and writes the
 * text of every token it receives to standard output.
 */
class cpp implements cppLexerTokenTypes {
    /** Selector shared with the lexers; nested lexers are pushed onto it for #include and macro calls. */
    public static TokenStreamSelector selector = new TokenStreamSelector();

    public static void main(String[] args) {
        try {
            cppLexer rootLexer = new cppLexer(new DataInputStream(System.in));
            // cppLexer.selector is a static field, so assign it through the class
            cppLexer.selector = selector;
            selector.select(rootLexer);

            // echo token text until the selector reports a real end of input
            Token current = selector.nextToken();
            while (current.getType() != Token.EOF_TYPE) {
                System.out.print(current.getText());
                current = selector.nextToken();
            }
        } catch (Exception failure) {
            System.err.println("exception: " + failure);
        }
    }
}

}
 
// Lexer class for the preprocessor. The members below hold the conditional
// state and the macro table used by the DIRECTIVE and IDENTIFIER rules.
class cppLexer extends Lexer;
 
options {
    testLiterals = false;
    k = 4;
}
 
// ENDIF has no rule of its own; DIRECTIVE re-types its token to ENDIF to stop
// the loops that discard tokens of a non-taken conditional branch.
tokens {
    ENDIF ;
}
 
{
    // All static fields are shared by every cppLexer instance (main lexer,
    // include sub-lexers and macro sub-lexers alike).
    public static TokenStreamSelector selector; // must be assigned externally
    protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true
    protected static List ifStates = new ArrayList(); // holds nested if conditions
    protected static Map defines = new Hashtable(); // holds the defines
    // Per-instance (non-static): only the sub-lexer created for one macro call sees these.
    protected Map defineArgs = new Hashtable(); // holds the args for a macro call
    // Invoked when this lexer reaches the end of its input: pop back to the
    // previously active lexer and ask the selector to retry from there.
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        } catch (NoSuchElementException e) {
            // return a real EOF if nothing in stack
        }
    }
}
 
// Handles a '#' directive: include, define, undef, ifdef/ifndef, else/elsif and endif.
// The conditional branches drive the shared ifState (1: true, 0: false,
// -1: false with no else allowed to fire) and the ifStates stack of enclosing states.
DIRECTIVE {
    // args.get(0) is the macro text; any further elements are formal parameter names
    List args = new ArrayList();
    // polarity for ifdef/ifndef, and the outcome of an elsif test ("else" leaves it true)
    boolean condition = true;
} : '#'
    ( "include" (WS)? includeFile:STRING {
        if (ifState==1) {
            // found this in examples/java/includeFile
            String name = includeFile.getText();
            // drop the surrounding quote characters
            name = name.substring(1,name.length()-1);
            try {
                cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
                sublexer.defines = defines; // want defines to be persistent
                sublexer.setFilename(name);
                selector.push(sublexer);
                selector.retry();
            } catch (FileNotFoundException fnf) {
                System.err.println("cannot find file "+name);
            }
        }
    }
    | "define" WS defineMacro:RAW_IDENTIFIER
    {
        args.add(""); // first element will hold the macro text
    }
        (
            ( '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0:RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( COMMA (WS)? defineArg1:RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText:MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\n' {newline();}
    { if (ifState==1) {
        // only record the definition while inside an active branch
        defines.put( defineMacro.getText(), args );
        $setType(Token.SKIP);
    }}
    | "undef" WS undefMacro:RAW_IDENTIFIER { if (ifState==1) {
        defines.remove(undefMacro.getText());
        $setType(Token.SKIP);
    }}
    | ("ifdef"|"ifndef"{condition=false;})
        WS ifMacro:RAW_IDENTIFIER
    {
        // remember the enclosing state so that endif can restore it
        ifStates.add(ifState);
        if (ifState==1) {
            condition = (defines.containsKey(ifMacro.getText())==condition);
            ifState = condition?1:0;
        } else {
            // enclosing branch is inactive: nothing in here may be taken
            ifState = -1;
        }
        if (ifState==1) {
            $setType(Token.SKIP);
        } else {
            // gobble up tokens until ENDIF (could be caused by else)
            for (;;) {
                try {
                    if (selector.nextToken().getType()==ENDIF) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        }
    }
    |
        ( "else" // treat like elsif (true)
        | "elsif" WS elsifMacro:RAW_IDENTIFIER {
            condition=defines.containsKey(elsifMacro.getText());
        }
        )
    {
        if (ifState==1) {
            // previous if/elsif was taken - discard rest
            ifState = -1;
            for (;;) {
                try {
                    if (selector.nextToken().getType()==ENDIF) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        } else if (ifState==0 && condition) {
            // "elsif" (true) or "else"
            // typed as ENDIF so the discard loop of the preceding branch stops here
            $setType(ENDIF);
            ifState = 1;
        }
    }
    | "endif" {
        condition = (ifState==1);
        try {
            // return to previous if state
            ifState = (Integer)ifStates.remove(ifStates.size()-1);
            if (condition) {
                $setType(Token.SKIP);
            } else {
                // tell if/else/elsif to stop discarding tokens
                $setType(ENDIF);
            }
        } catch (IndexOutOfBoundsException e) {
            // endif with no if
            // Removing index -1 from an empty ArrayList raises plain
            // IndexOutOfBoundsException on current JDKs and its subclass
            // ArrayIndexOutOfBoundsException on older ones; the parent type covers both.
        }
    }
    );
 
// An identifier, with macro expansion. The name is looked up first among the
// arguments of the current macro call (defineArgs) and then, when a token is
// being created, among the global defines. A macro that declares parameters
// must be followed by a parenthesised argument list of matching length.
IDENTIFIER options {testLiterals=true;} {
    List define = new ArrayList();
    List args = new ArrayList();
} :
    identifier:RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (_createToken && define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0:EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1:EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
    | { !((define!=null) && (define.size()>1)) }?
    )
{ if (define!=null) {
    // element 0 of a definition is the replacement text
    String defineText = (String)define.get(0);
    if (!_createToken) {
        // just substitute text if called from EXPR - no token created
        $setText(defineText);
    } else {
        // create a new lexer to handle the macro text
        // (read through a StringReader: the deprecated StringBufferInputStream
        // does not convert characters to bytes correctly)
        cppLexer sublexer = new cppLexer(new StringReader(defineText));
        for (int i=0;i<args.size();++i) {
            // treat macro arguments similar to local defines
            List arg = new ArrayList();
            arg.add((String)args.get(i));
            // formal parameter names start at index 1 of the definition
            sublexer.defineArgs.put( (String)define.get(1+i), arg );
        }
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};
 
// Quoted literal in either quote style. Inside the quotes a backslash
// consumes the following character, so an escaped quote does not end the literal.
STRING
    : '"' ( '\\' . | ~('\\'|'"') )* '"' // double quoted string
    | '\'' ( '\\' . | ~('\\'|'\'') )* '\'' // single quoted string
    ;
 
// Replacement text of a #define: everything up to, but not including, the
// terminating newline. A backslash immediately before a newline continues the
// text on the next line; the '!' suffix keeps that backslash out of the token text.
protected MACRO_TEXT :
    ( '\\'! '\n' {newline();} // escaped newline
    | ~'\n'
    )*;
 
 
// A single whitespace character (space, tab, form feed or newline).
// The SKIP action is commented out, so whitespace is returned as a token
// rather than discarded.
WS :
    ( ' '
    | '\t'
    | '\f'
    | '\n' {newline();}
    ) { /*$setType(Token.SKIP);*/ };
 
// Single-line and multi-line comments. newline() is called for every line break
// consumed so line counting stays correct. As with WS, the SKIP action is
// commented out, so comments are returned as tokens.
COMMENT :
    ( "//" (~'\n')* '\n' {newline();} // single line comment
    | "/*" ( options{greedy=false;} : '\n' {newline();} | ~('\n') )* "*/" // multi-line comment
    ) { /*$setType(Token.SKIP);*/ };
 
// Plain identifier shape (letter or underscore, then letters, digits or underscores)
// with no macro lookup; used by DIRECTIVE and IDENTIFIER as a helper rule.
protected RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_')
('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;
 
// Numeric literal: one digit followed by any mix of digits, letters and underscores.
// Letters are accepted so that alpha suffixes on numbers (e.g. L for long) stay in the same token.
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ;
 
// group symbols into categories to parse EXPR
// LEFT/RIGHT are the opening and closing brackets, COMMA separates arguments,
// and OPERATOR covers the remaining punctuation characters listed below.
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;
 
// One macro-call argument, used by IDENTIFIER to collect argument text.
// Shape: optional whitespace, an optional number or identifier, then optionally
// a bracketed group / string / operator followed by more EXPR. A bare COMMA or
// an unmatched RIGHT is not accepted, which is what ends an argument.
protected EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
        (
            ( LEFT EXPR ( COMMA EXPR )* RIGHT
            | STRING
            | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR
        )?
    ;

 

4.参考:

How to do preprocessing in antlr v4?

看提到了:

TokenStreamRewriter

不过后来没继续弄。

5.后来是别的同事,搞定了include的,包括递归调用的功能,完整代码如下:

(同时此处后又加了对于define的简单替换)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
// Combined lexer + parser grammar named "preprocess".
grammar preprocess;
//lexer grammar preprocess;
 
// Generate Java source.
options{
    language=Java;
}
 
// Placed at the top of the generated lexer: the imports needed by the
// lexer members (Map, Hashtable, Stack).
@lexer::header {
//package com.mm.antlrv3demo;
 
import java.io.*;
import java.util.*;
}
 
// Placed at the top of the generated parser; currently only a disabled package line.
@parser::header {
//package com.mm.antlrv3demo;
}
 
// Extra lexer members: the macro table, the stack of interrupted input streams
// used by INCLUDE, and a nextToken() override that resumes the including
// stream when an included one ends.
@lexer::members {
    // Snapshot of an interrupted input: the stream itself plus a mark taken
    // on it at construction time, used later to rewind to that position.
    class SaveStruct {
      SaveStruct(CharStream input){
        this.input = input;
        this.marker = input.mark();
      }
      public CharStream input;
      public int marker;
     }
  
     // static: shared by all instances of the lexer
     static Map defines = new Hashtable(); // holds the defines
  
     // One entry per include currently being processed (innermost on top).
     Stack<SaveStruct> includes = new Stack<SaveStruct>();
  
    // We should override this method for handling EOF of included file
     public Token nextToken(){
       Token token = super.nextToken();
  
       if(token.getType() == Token.EOF && !includes.empty()){
        // We've got EOF and have non empty stack.
        // Restore the stream that issued the include and rewind it to the saved mark.
         SaveStruct ss = includes.pop();
         setCharStream(ss.input);
         input.rewind(ss.marker);
         //this should be used instead of super [like below] to handle exits from nested includes
         //it matters, when the 'include' token is the last in previous stream (using super, lexer 'crashes' returning EOF token)
         token = this.nextToken();
       }
  
      // Skip first token after switching on another input.
      // You need to use this rather than super as there may be nested include files
      // NOTE(review): the cast assumes every token produced here is a CommonToken - confirm.
       if(((CommonToken)token).getStartIndex() < 0)
         token = this.nextToken();
  
       return token;
     }
}
 
// Line comments (terminated by LF or CRLF) and block comments; both are
// dropped via skip(), so no COMMENT token is emitted.
COMMENT
    :   ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
    |   ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
    ;
 
// and lexer rule
// '#include "file"': saves the current input on the includes stack and
// continues lexing from the named file. nextToken() pops the stack again
// once the included file reaches EOF.
INCLUDE    :    '#include' (WS)? f=STRING
{
    String name = f.getText();
    // strip the surrounding double quotes
    name = name.substring(1,name.length()-1);
    try {
        // Open the file before touching any lexer state, so a failed open
        // leaves neither a stale mark nor a stale stack entry behind.
        CharStream included = new ANTLRFileStream(name);

        // save current lexer's state
        includes.push(new SaveStruct(input));

        // switch on new input stream
        setCharStream(included);
        reset();

    } catch(Exception fnf) {
        // keep the underlying failure attached as the cause
        throw new Error("Cannot open file " + name, fnf);
    }
};
 
// '#define NAME "text"': records NAME -> the quoted text (quotes included) in
// the shared defines table, logs it, and emits no token.
DIRECTIVE   :   ('#define' WS* defineMacro=ID WS* defineText=STRING)
    {
        final String name = defineMacro.getText();
        final String replacement = defineText.getText();
        System.out.println("Found macro: " + name + "=" + replacement);
        defines.put(name, replacement);
        skip();
    };
 
// Identifier. When its text is a key in the defines table, the token text is
// replaced by the stored macro value (simple, argument-less substitution).
ID  :   ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
    {
        String name = getText();
        // defines is a Hashtable, which never stores null values, so a null
        // result means the identifier is not a macro
        Object replacement = defines.get(name);
        if (replacement != null) {
            String macroValue = (String) replacement;
            System.out.println("Found macro reference, so replce " + name + " to " + macroValue);

            setText(macroValue);
        }
    };
 
// Unsigned decimal integer.
INT :   '0'..'9'+
    ;
 
// Floating-point literal in one of three shapes: digits '.' optional digits,
// '.' digits, or digits followed by a mandatory exponent.
FLOAT
    :   ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    |   '.' ('0'..'9')+ EXPONENT?
    |   ('0'..'9')+ EXPONENT
    ;
 
// A single whitespace character, routed to the HIDDEN channel.
WS  :   ( ' '
        | '\t'
        | '\r'
        | '\n'
        ) {$channel=HIDDEN;}
    ;
 
// Catch-all token for the single punctuation characters listed below.
RestSymbo
    :   '{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;
 
// Double-quoted string; its contents are escape sequences or any character
// other than a backslash or a double quote.
STRING
    :  '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
    ;
 
// Single-quoted character literal holding exactly one escape sequence or one
// character other than a single quote or a backslash.
CHAR:  '\'' ( ESC_SEQ | ~('\''|'\\') ) '\''
    ;
 
// Fragment rules: helpers referenced by FLOAT, STRING and CHAR.

// Exponent part of a float: e or E, an optional sign, then one or more digits.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
 
// One hexadecimal digit, either case.
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
 
// Backslash escape: a single-character escape, a unicode escape or an octal escape.
fragment
ESC_SEQ
    :   '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    |   UNICODE_ESC
    |   OCTAL_ESC
    ;
 
// Octal escape of one to three digits; the three-digit form limits the first digit to 0..3.
fragment
OCTAL_ESC
    :   '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7')
    ;
 
// Unicode escape: backslash, 'u', then exactly four hex digits.
fragment
UNICODE_ESC
    :   '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;
     
// Parser rules. "header" accepts zero or more include directives; each
// "include" is a single INCLUDE token, which the lexer rule has already acted on.
header
    :   include*;
include :   INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>';

 

【总结】

最终还是通过antlr语法文件,加上手动添加的action code(Java代码),来实现递归处理#include的逻辑。

转载请注明:在路上 » 【记录】尝试用antlr处理C代码中的#include

发表我的评论
取消评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
82 queries in 0.215 seconds, using 22.18MB memory