【记录】尝试用antlr处理C代码中的#include

1.真正去实现的时候，可以参考：

Tips on designing a preprocessor for C++ using Antlr

中的例子，去添加对应的action code。

2.参考：

How to do preprocessing in antlr v4?

A list of all available downloads at Soft Gems

Windows Resource File Parser + Converter

下载到 312KB的rc-converter.zip。

3.又从：

[antlr-interest] completed an ANTLR-based cpp preprocessor (#include, #if, #define, etc)

拷贝了一份代码，供后续参考：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

//copy from
//http://www.antlr3.org/pipermail/antlr-interest/2004-July/008778.html
//name to:
//cppLexer.g
 
// Author: Eric Mahurin
// License: just give me credit
 
// File-level option: the code generated from this grammar is Java.
options {
    language="Java";
}
 
{
 
import java.io.*;
import java.util.*;
import antlr.*;
 
class cpp implements cppLexerTokenTypes {
    public static TokenStreamSelector selector = new TokenStreamSelector();
    public static void main(String[] args) {
        try {
            // will need a stack of lexers for #include and macro calls
            cppLexer mainLexer = new cppLexer(new DataInputStream(System.in));
            mainLexer.selector = selector;
            selector.select(mainLexer);
            for (;;) {
                Token t = selector.nextToken();
                if (t.getType()==Token.EOF_TYPE) break;
                System.out.print(t.getText());
            }
        } catch(Exception e) {
            System.err.println("exception: "+e);
        }
    }
}
 
}
 
// Start of the lexer grammar; the generated lexer class is named cppLexer.
class cppLexer extends Lexer;
 
options {
    // literal testing is off by default; the IDENTIFIER rule switches it on for itself
    testLiterals = false;
    // lookahead depth used by the lexer
    k = 4;
}
 
tokens {
    // token type without a rule of its own; the DIRECTIVE rule assigns it to mark
    // the point where discarding of tokens must stop
    ENDIF ;
}
 
{
    // Selector used to push and pop lexers (included files, macro bodies).
    public static TokenStreamSelector selector; // must be assigned externally
    // State of the innermost conditional; static, so shared by every lexer instance.
    protected static Integer ifState = 1; // -1: no-else false, 0:false, 1: true
    // Stack of the ifState values of the enclosing conditionals.
    protected static List ifStates = new ArrayList(); // holds nested if conditions
    // Macro name -> List whose element 0 is the macro text, followed by the parameter names.
    protected static Map defines = new Hashtable(); // holds the defines
    // Per-lexer table: parameter name -> single-element List holding the argument text.
    protected Map defineArgs = new Hashtable(); // holds the args for a macro call
    // Called when this lexer reaches the end of its input: switch back to the
    // lexer below it on the selector stack and ask the selector to retry.
    public void uponEOF() throws TokenStreamException, CharStreamException {
        try {
            selector.pop(); // return to old lexer/stream
            selector.retry();
        } catch (NoSuchElementException e) {
            // return a real EOF if nothing in stack
        }
    }
}
 
// Lexer rule for every preprocessor directive: include, define, undef,
// ifdef/ifndef, else/elsif and endif.
// Rule-local state:
//   args      - for a define: element 0 is the macro text, the rest are parameter names
//   condition - truth value of the conditional being evaluated
DIRECTIVE {
    List args = new ArrayList();
    boolean condition = true;
} : '#'
    // include: push a lexer for the named file (only in an active region)
    ( "include" (WS)? includeFile:STRING {
        if (ifState==1) {
            // found this in examples/java/includeFile
            String name = includeFile.getText();
            // strip the surrounding quote characters
            name = name.substring(1,name.length()-1);
            try {
                cppLexer sublexer = new cppLexer(new DataInputStream(new FileInputStream(name)));
                sublexer.defines = defines; // want defines to be persistent
                sublexer.setFilename(name);
                selector.push(sublexer);
                selector.retry();
            } catch (FileNotFoundException fnf) {
                // a missing file is reported and lexing continues in the current file
                System.err.println("cannot find file "+name);
            }
        }
    }
    // define: record the macro name, optional parameter list and replacement text
    | "define" WS defineMacro:RAW_IDENTIFIER
    {
        args.add(""); // first element will hold the macro text
    }
        (
            ( '(' // get arguments if you find them (no spaces before left paren)
                (WS)? defineArg0:RAW_IDENTIFIER (WS)? {args.add(defineArg0.getText());}
                ( COMMA (WS)? defineArg1:RAW_IDENTIFIER (WS)? {args.add(defineArg1.getText());} )*
              ')'
            | ' '|'\t'|'\f'
            )
            ( options{greedy=true;}: ' '|'\t'|'\f' )*
            // store the text verbatim - tokenize when called
            defineText:MACRO_TEXT {args.set(0,defineText.getText());}
        )? '\n' {newline();}
    { if (ifState==1) {
        defines.put( defineMacro.getText(), args );
        $setType(Token.SKIP);
    }}
    // undef: forget the macro (only in an active region)
    | "undef" WS undefMacro:RAW_IDENTIFIER { if (ifState==1) {
        defines.remove(undefMacro.getText());
        $setType(Token.SKIP);
    }}
    // ifdef / ifndef: open a new conditional level
    | ("ifdef"|"ifndef"{condition=false;})
        WS ifMacro:RAW_IDENTIFIER
    {
        // remember the enclosing state so that endif can restore it
        ifStates.add(ifState);
        if (ifState==1) {
            condition = (defines.containsKey(ifMacro.getText())==condition);
            ifState = condition?1:0;
        } else {
            // enclosing region is inactive: nothing in here may become active
            ifState = -1;
        }
        if (ifState==1) {
            $setType(Token.SKIP);
        } else {
            // gobble up tokens until ENDIF (could be caused by else)
            for (;;) {
                try {
                    int skippedType = selector.nextToken().getType();
                    // also stop at end of input: without this an unterminated
                    // conditional would loop forever on repeated EOF tokens
                    if (skippedType==ENDIF || skippedType==Token.EOF_TYPE) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        }
    }
    // else / elsif: switch branches inside the current conditional level
    |
        ( "else" // treat like elsif (true)
        | "elsif" WS elsifMacro:RAW_IDENTIFIER {
            condition=defines.containsKey(elsifMacro.getText());
        }
        )
    {
        if (ifState==1) {
            // previous if/elsif was taken - discard rest
            ifState = -1;
            for (;;) {
                try {
                    int skippedType = selector.nextToken().getType();
                    // also stop at end of input (see the ifdef branch)
                    if (skippedType==ENDIF || skippedType==Token.EOF_TYPE) break;
                } catch (TokenStreamRetryException r) {
                    // just continue if someone tried retry
                }
            }
            // retry in case we switched lexers
            selector.retry();
        } else if (ifState==0 && condition) {
            // "elsif" (true) or "else"
            $setType(ENDIF);
            ifState = 1;
        }
    }
    // endif: close the current conditional level
    | "endif" {
        condition = (ifState==1);
        try {
            // return to previous if state
            ifState = (Integer)ifStates.remove(ifStates.size()-1);
            if (condition) {
                $setType(Token.SKIP);
            } else {
                // tell if/else/elsif to stop discarding tokens
                $setType(ENDIF);
            }
        } catch (IndexOutOfBoundsException e) {
            // endif with no if: remove(-1) on the empty list lands here.
            // The general IndexOutOfBoundsException is caught because newer
            // JDKs no longer throw the ArrayIndexOutOfBounds subclass.
        }
    }
    );
 
// Lexer rule for identifiers, including macro expansion.
// An identifier is looked up first among the arguments of the macro call being
// expanded (defineArgs) and then, when a token is being created, among the
// defined macros. A macro that declares parameters must be followed by a
// parenthesised argument list with the same number of arguments.
// Rule-local state:
//   define - the matching entry (macro text first, then parameter names), or null
//   args   - the argument texts collected from the call
IDENTIFIER options {testLiterals=true;} {
    List define = new ArrayList();
    List args = new ArrayList();
} :
    identifier:RAW_IDENTIFIER
    {
        // see if this is a macro argument
        define = (List)defineArgs.get(identifier.getText());
        if (_createToken && define==null) {
            // see if this is a macro call
            define = (List)defines.get(identifier.getText());
        }
    }
    ( { (define!=null) && (define.size()>1) }? (WS|COMMENT)?
        // take in arguments if macro call requires them
        '('
        callArg0:EXPR {args.add(callArg0.getText());}
        ( COMMA callArg1:EXPR {args.add(callArg1.getText());} )*
        { args.size()==define.size()-1 }? // better have right amount
        ')'
    | { !((define!=null) && (define.size()>1)) }?
    )
{ if (define!=null) {
    String defineText = (String)define.get(0);
    if (!_createToken) {
        // just substitute text if called from EXPR - no token created
        $setText(defineText);
    } else {
        // create a new lexer to handle the macro text; a StringReader is used
        // because the deprecated StringBufferInputStream does not convert
        // characters to bytes correctly
        cppLexer sublexer = new cppLexer(new StringReader(defineText));
        for (int i=0;i<args.size();++i) {
            // treat macro arguments similar to local defines
            List arg = new ArrayList();
            arg.add((String)args.get(i));
            sublexer.defineArgs.put( (String)define.get(1+i), arg );
        }
        selector.push(sublexer);
        // retry in new lexer
        selector.retry();
    }
}};
 
// Quoted literal: a backslash escapes whatever character follows it; otherwise
// any character except a backslash or the closing quote is accepted.
STRING
    : '"' ( '\\' . | ~('\\'|'"') )* '"' // double quoted string
    | '\'' ( '\\' . | ~('\\'|'\'') )* '\'' // single quoted string
    ;
 
// Replacement text of a define: everything up to the end of the line.
// A backslash directly before a newline continues the text on the next line;
// the '!' suffix keeps that backslash out of the token text.
// Protected: only used from other lexer rules (DIRECTIVE).
protected MACRO_TEXT :
    ( '\\'! '\n' {newline();} // escaped newline
    | ~'\n'
    )*;
 
 
// A single whitespace character; a newline also advances the line counter.
// The SKIP action is commented out, so whitespace is returned as a token.
WS :
    ( ' '
    | '\t'
    | '\f'
    | '\n' {newline();}
    ) { /*$setType(Token.SKIP);*/ };
 
// Single-line and multi-line comments; every newline inside advances the line
// counter. The SKIP action is commented out, so comments are returned as tokens.
COMMENT :
    ( "//" (~'\n')* '\n' {newline();} // single line comment
    | "/*" ( options{greedy=false;} : '\n' {newline();} | ~('\n') )* "*/" // multi-line comment
    ) { /*$setType(Token.SKIP);*/ };
 
// Plain identifier text: a letter or underscore followed by letters, digits or
// underscores. Protected: used by DIRECTIVE and IDENTIFIER, which decide what
// the name means.
protected RAW_IDENTIFIER : ('a'..'z'|'A'..'Z'|'_')
('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;
 
// Numeric literal: a digit followed by digits, letters or underscores, which
// allows alpha suffixes on numbers (i.e. L:long).
NUMBER : ('0'..'9') ('0'..'9'|'a'..'z'|'A'..'Z'|'_')* ;
 
// group symbols into categories to parse EXPR
// LEFT and RIGHT are opening and closing brackets of any kind, COMMA separates
// macro arguments, OPERATOR is every other punctuation character listed below.
LEFT  : '(' | '[' | '{' ;
RIGHT : ')' | ']' | '}' ;
COMMA : ',' ;
OPERATOR : '!' | '#' | '$' | '%' | '&' | '*' | '+' | '-' | '.' | '/' | ':' | ';' | '<' | '=' | '>' | '?' | '@' | '\\' | '^' | '`' | '|' | '~' ;
 
// Text of one macro-call argument, matched recursively: optional whitespace,
// an optional number or identifier, then optionally a bracketed group (itself
// a comma-separated list of EXPR), a string or an operator, followed by more
// EXPR. A bare COMMA or RIGHT is not matched here, so it ends the argument.
// Protected: only used from other lexer rules (IDENTIFIER).
protected EXPR // allow just about anything without being ambiguous
    : (WS)? (NUMBER|IDENTIFIER)?
        (
            ( LEFT EXPR ( COMMA EXPR )* RIGHT
            | STRING
            | OPERATOR // quotes, COMMA, LEFT, and RIGHT not in here
            )
            EXPR
        )?
    ;

4.参考：

How to do preprocessing in antlr v4?

看提到了：

TokenStreamRewriter

不过后来没继续弄。

5.后来是别的同事，搞定了include的，包括递归调用的功能，完整代码如下：

（同时，此处后来又加上了对define的简单替换）

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

// Combined grammar: lexer rules plus the two small parser rules at the end.
grammar preprocess;
//lexer grammar preprocess;
 
options{
    // generate Java code
    language=Java;
}
 
// Code emitted at the top of the generated lexer source file.
@lexer::header {
//package com.mm.antlrv3demo;
 
import java.io.*;
import java.util.*;
}
 
// Code emitted at the top of the generated parser source file
// (currently only a commented-out package line).
@parser::header {
//package com.mm.antlrv3demo;
}
 
@lexer::members {
    class SaveStruct {
      SaveStruct(CharStream input){
        this.input = input;
        this.marker = input.mark();
      }
      public CharStream input;
      public int marker;
     }
  
     // Macro table shared by all lexer instances (static): macro name -> replacement
     // text. Filled by the DIRECTIVE rule and read by the ID rule.
     static Map defines = new Hashtable(); // holds the defines
  
     // Saved states of the streams that are waiting for an included file to
     // finish; pushed by the INCLUDE rule and popped in nextToken.
     Stack<SaveStruct> includes = new Stack<SaveStruct>();
  
    // We should override this method for handling EOF of included file
     public Token nextToken(){
       Token token = super.nextToken();
  
       if(token.getType() == Token.EOF && !includes.empty()){
        // We've got EOF and have non empty stack.
         SaveStruct ss = includes.pop();
         setCharStream(ss.input);
         input.rewind(ss.marker);
         //this should be used instead of super [like below] to handle exits from nested includes
         //it matters, when the 'include' token is the last in previous stream (using super, lexer 'crashes' returning EOF token)
         token = this.nextToken();
       }
  
      // Skip first token after switching on another input.
      // You need to use this rather than super as there may be nested include files
       if(((CommonToken)token).getStartIndex() < 0)
         token = this.nextToken();
  
       return token;
     }
}
 
// Single-line comments (up to and including the line break) and multi-line
// comments; both are skipped, so no token is produced for them.
COMMENT
    :   ('//' ~('\n'|'\r')* '\r'? '\n') {skip();}
    |   ('/*' ( options {greedy=false;} : . )* '*/') {skip();}
    ;
 
// Lexer rule for an include directive followed by a quoted file name.
// The action saves the current input on the include stack and switches the
// lexer to the named file; nextToken switches back at the end of that file.
// Throws Error when the file cannot be opened.
INCLUDE    :    '#include' (WS)? f=STRING
{
    String name = f.getText();
    // strip the surrounding quote characters
    name = name.substring(1,name.length()-1);
    try {
        // open the included file first, so that a failure leaves the include
        // stack untouched
        CharStream included = new ANTLRFileStream(name);

        // save current lexer's state
        SaveStruct ss = new SaveStruct(input);
        includes.push(ss);

        // switch on new input stream
        setCharStream(included);
        reset();

    } catch(Exception fnf) {
        // keep the original exception as the cause so the real reason is not lost
        throw new Error("Cannot open file " + name, fnf);
    }
};
 
// Lexer rule for a simple define: a macro name followed by a string literal.
// The pair is stored in the shared macro table and the directive itself
// produces no token.
DIRECTIVE   :   ('#define' WS* defineMacro=ID WS* defineText=STRING)
    {
        String name = defineMacro.getText();
        String replacement = defineText.getText();
        System.out.println("Found macro: " + name + "=" + replacement);
        // remember the macro so that the ID rule can substitute it later
        defines.put(name, replacement);
        skip();
    };
 
// Identifier: a letter or underscore followed by letters, digits or underscores.
// When the identifier is the name of a defined macro, the token text is
// replaced by the stored replacement text.
ID  :   ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'0'..'9'|'_')*
    {
            String idStr = getText();
            // a single lookup is enough: the table is a Hashtable, which cannot
            // hold null values, so null means the name is not a macro
            Object replacement = defines.get(idStr);
            if (replacement != null) {
                String macroValue = (String)replacement;
                System.out.println("Found macro reference, so replce " + idStr + " to " + macroValue);

                setText(macroValue);
            }
    };
 
// Integer literal: one or more decimal digits.
INT :   '0'..'9'+
    ;
 
// Floating-point literal in one of three forms: digits with a decimal point
// (fraction and exponent optional), a leading decimal point followed by
// digits, or digits followed by a mandatory exponent.
FLOAT
    :   ('0'..'9')+ '.' ('0'..'9')* EXPONENT?
    |   '.' ('0'..'9')+ EXPONENT?
    |   ('0'..'9')+ EXPONENT
    ;
 
// A single whitespace character; the token is put on the hidden channel.
WS  :   ( ' '
        | '\t'
        | '\r'
        | '\n'
        ) {$channel=HIDDEN;}
    ;
 
// Catch-all token for the single punctuation characters listed below.
// NOTE(review): the name looks like a truncated "RestSymbol" - confirm before renaming.
RestSymbo
    :   '{' | '}' | '&' | ';' | ',' | '+' | '-' | ')' | '(' | '~' | '/' | '`' | '$' | '@' | '%' | '^' | '#' | '\\' ;
 
// Double-quoted string literal: escape sequences, or any character other than
// a backslash or a double quote.
STRING
    :  '"' ( ESC_SEQ | ~('\\'|'"') )* '"'
    ;
 
// Single-quoted character literal: exactly one escape sequence or one
// character other than a single quote or a backslash.
CHAR:  '\'' ( ESC_SEQ | ~('\''|'\\') ) '\''
    ;
 
// Exponent part of a floating-point literal: e or E, an optional sign, digits.
// Fragment rule: only used inside other lexer rules.
fragment
EXPONENT : ('e'|'E') ('+'|'-')? ('0'..'9')+ ;
 
// One hexadecimal digit (fragment rule).
fragment
HEX_DIGIT : ('0'..'9'|'a'..'f'|'A'..'F') ;
 
// Escape sequence inside a string or character literal: a backslash followed
// by one of the listed characters, a unicode escape or an octal escape
// (fragment rule).
fragment
ESC_SEQ
    :   '\\' ('b'|'t'|'n'|'f'|'r'|'\"'|'\''|'\\')
    |   UNICODE_ESC
    |   OCTAL_ESC
    ;
 
// Octal escape: a backslash followed by one to three octal digits; in the
// three-digit form the first digit is limited to 0..3 (fragment rule).
fragment
OCTAL_ESC
    :   '\\' ('0'..'3') ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7') ('0'..'7')
    |   '\\' ('0'..'7')
    ;
 
// Unicode escape: backslash, the letter u and exactly four hexadecimal digits
// (fragment rule).
fragment
UNICODE_ESC
    :   '\\' 'u' HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
    ;
     
// Parser rules: a header is any number of include directives, and an include
// is a single INCLUDE token produced by the lexer.
header
    :   include*;
include :   INCLUDE;//'#include ' '<' ID ('.h' | '.ddl') '>';

【总结】

最终还是通过antlr语法代码，加上手动添加的action code（Java代码），来实现递归处理include的逻辑。

转载请注明：在路上 » 【记录】尝试用antlr处理C代码中的#include

Post Views: 1,557

与本文相关的文章

订阅在路上