Putting it all together

The code to be compiled

All of the code, makefiles and VC++ 4.2 project file, are in the following zip files (pkzip and tar.gz formats):

Here's what we have at this point:

#header

<< // any definitions that you need in the 

   //  generated files

>>





<< // scanner definitions would go here

#include "dlexerbase.h"

#include "dlglexer.h"

#include "atoken.h"

typedef antlrcommontoken antlrtoken;

int main(int argc, char **argv) {

    dlgfileinput in(stdin);

    dlglexer scanner(&in);

    antlrtokenbuffer pipe(&scanner);

    antlrtoken tok;

    scanner.settoken(&tok);

    xlparser xlparser(&pipe);

    xlparser.init(); // initialize

    xlparser.program(); // start first rule return 0;

} >>





// scanner rules



#lexclass START // Not really necessary, but

                // good commentary nonetheless



// White Space

#token "[\ \t\r]"     <<skip();>>

#token "\n"           <<skip(); newline();>>



// Comments

#token "// ~[\n@]* [\n@]" <<skip(); newline();>>



// Literals

#token INTLIT  "[0-9]+"

#token CHARLIT "\' ~[] \'"

#token         "\"" << // start stringlit

                       skip(); mode(string);

                    >>



// Keywords

#token PROGRAM   "program"

#token BEGIN     "begin"

#token END       "end"

#token VAR       "var"

#token CONSTANT  "constant"

#token TYPE      "type"

#token RECORD    "record"

#token ARRAY     "array"

#token OF        "of"

#token PROCEDURE "procedure"

#token PUT       "put"

#token GET       "get"

#token SKIPLINE  "skipLine"

#token NEWLINE   "newLine"

#token EXIT      "exit"

#token WHEN      "when"

#token RETURN    "return"

#token IF        "if"

#token THEN      "then"

#token ELSE      "else"

#token ELSIF     "elsif"

#token WHILE     "while"

#token LOOP      "loop"

#token AND       "and"

#token OR        "or"

#token INTEGER  "Integer"

#token BOOLEAN "Boolean" 



// Operators

#token DOT        "."

#token BECOMES    ":="

#token COLON      ":"

#token SEMI       ";"

#token COMMA      ","

#token EQUALS     "="

#token LBRACKET   "\["

#token RBRACKET   "\]"

#token DOTDOT     ".."

#token LPAREN     "\("

#token RPAREN     "\)"

#token NOT_EQUALS "/="

#token LT         "<"

#token lte        "<="

#token GT         ">"

#token GTE        ">="

#token PLUS       "\+"

#token MINUS      "\-"

#token TIMES      "\*"

#token DIV        "/"

#token MOD        "mod"

#token NOT        "not"



// Identifiers

#token IDENT "[a-zA-Z] [a-zA-Z0-9]*"



// String Literal Processing

// Separate Scanner class!

#lexclass STRING

#token           "\"\"" << more(); replchar('\"');>>

#token BADSTRING "\n"   << replchar('\0');

                           newline();

                           mode(start);

                           /* error message */

                        >>

#token STRINGLIT "\""   << replchar('\0'); 

                           mode(start);

                        >>

#token           "~[]"  <<more();>>



#tokclass STRING_LITERAL {STRINGLIT BADSTRING}

#tokclass ADD_OP {PLUS MINUS}

#tokclass RELATIONAL_OP {EQUALS NOT_EQUALS GT GTE LT LTE}

#tokclass BOOLEAN_OP {AND OR}

#tokclass MULT_OP {TIMES DIV MOD}



class XLParser

{

<<

    // parser definitions go here

    public:

        void init() {

            antlrparser::init();

            // any specific initialization you need

            // (if none, don't override init() 

}>>







program

  : PROGRAM IDENT EQUALS 

        subprogramBody 

    DOT

    "@" // end-of-file

  ;





subprogramBody

  : (basicDecl)*

    (procedureDecl)*

    BEGIN

        statementList

    END IDENT

  ;





basicDecl

  : varDecl

  | constDecl

  | typeDecl

  ;





varDecl

  : VAR identList COLON typeName

    {BECOMES constantValue}

    SEMI

  ;





constDecl

  : CONSTANT identList COLON typeName

    BECOMES constantValue SEMI

  ;





identList

  : IDENT (COMMA IDENT)*

  ;





constantValue

  : INTLIT

  | STRING_LITERAL

  | IDENT

  ;





typeDecl

  : TYPE IDENT EQUALS

    ( arrayDecl

    | recordDecl

    )

    SEMI

  ;





arrayDecl

  : ARRAY LBRACKET integerConstant 

      DOTDOT integerConstant RBRACKET

      OF typeName

  ;



integerConstant

  : INTLIT

  | IDENT // again, a constant...

  ;





recordDecl

  : RECORD (identList COLON typeName SEMI)+ 

    END RECORD

  ;





typeName

  : IDENT

  | INTEGER

  | BOOLEAN

  ;





procedureDecl

  : PROCEDURE IDENT {formalParameters} EQUALS

        subprogramBody

    SEMI

  ;





formalParameters

  : LPAREN parameterSpec (SEMI parameterSpec)*

    RPAREN

  ;





parameterSpec

  : {VAR} identList COLON typeName

  ;



statement

  : exitStatement

  | returnStatement

  | ifStatement

  | loopStatement

  | ioStatement

  | (IDENT (LPAREN|SEMI))? procedureCallStatement

  | assignmentStatement

  ;



statementList

  : statement statementList

  | (endStatement)? endStatement statementList

  | // nothing

  ;



assignmentStatement

  : variableReference BECOMES expression SEMI

  ;





exitStatement

  : EXIT WHEN expression SEMI

  ;



procedureCallStatement

  : IDENT {actualParameters} SEMI

  ;



actualParameters

  : LPAREN { expression (COMMA expression)* } RPAREN

  ;





returnStatement

  : RETURN SEMI

  ;





ifStatement

  : IF ifPart END IF SEMI

  ;



ifPart

  : expression THEN

    statementList

    { ELSIF ifPart

    | ELSE statementList

    }

  ;



loopStatement

  : {WHILE expression} LOOP

        statementList

    END LOOP SEMI

  ;



endStatement

  : END SEMI

  ;



variableReference

  : IDENT

    ( LBRACKET expression RBRACKET

    | DOT IDENT

    )*

  ;



ioStatement

  : PUT LPAREN expression RPAREN SEMI

  | GET LPAREN variableReference RPAREN SEMI

  | SKIPLINE {LPAREN RPAREN} SEMI

  | NEWLINE {LPAREN RPAREN} SEMI

  ;





primitiveElement

  : variableReference

  | constantValue

  | LPAREN expression RPAREN

  ;





booleanNegationExpression

  : (NOT)* primitiveElement

  ;



signExpression

  : (ADD_OP)* booleanNegationExpression

  ;





multiplyingExpression

  : signExpression (MULT_OP signExpression)*

  ;



addingExpression

  : multiplyingExpression (ADD_OP multiplyingExpression)*

  ;



relationalExpression

  : addingExpression (RELATIONAL_OP addingExpression)*

  ;



expression

  : relationalExpression (BOOLEAN_OP relationalExpression)*

  ;

               

Understanding the PCCTS Compilation Process

The Files Involved

At some point I will expand on this a bit more to help understand how all the pieces fit together. For now, it may be a bit sketchy...

The following parts are involved when building your PCCTS parser. (Assume that you save your parser as xl.g:)

xl.g Your grammar source file
XLParser.h generated by ANTLR -- contains the class declaration for the parser that was generated
XLParser.cpp generated by ANTLR -- contains the definition of static class variables for the generated parser
xl.cpp generated by ANTLR -- contains the definition of the rules used in the generated parser
tokens.h generated by ANTLR -- contains the definition of the enumeration type used for token ids
parser.dlg generated by ANTLR -- contains the scanner definitions that will be translated by DLG
DLGLexer.h, DLGLexer.cpp generated by DLG -- the generated scanner
AParser.cpp, AParser.h in the PCCTS/h directory -- base parser class
AToken.h, ATokPtr.h in the PCCTS/h directory -- base token declarations for ANTLR
ATokenBuffer.cpp in the PCCTS/h directory -- token buffer class
DLexerBase.cpp in the PCCTS/h directory -- base scanner class

A Makefile

You can use genmk (provided in the support directory of the pccts distribution) to generate a makefile for your parser. I have provided makefiles for Visual C++ 4.2 and a generic UNIX makefile. You should be able to modify these for your specific platform.

At some point I'll expand this section to describe the way the makefile works. For now, trust it (or understand it, if you dare! Heck, it's not that bad...)

So now we try to compile/make it...

For our first compile, use option -w2 to ANTLR. You'll need to change this in the makefile, or in the "Custom Build Options" for xl.g in Visual C++. You'll see why in a minute... Just make sure you change it back after I make my somewhat lengthy point.

Notice the warnings you get from the -w2 option like

warning: token label has no associated

         rexpr: STRING_LITERAL               

This means one of two things:

This is a weak point in ANTLR (there aren't many...) You need to use the -w2 option to see if you misspelled or just plain forgot to define a token in your scanner. Look through the output. Notice that all of the tokens that are mentioned are token classes except CONST. Hmmm...

Look at the definition of #token CONSTANT. That's the problem. We wrote CONST where we meant CONSTANT. Change CONST to CONSTANT in the grammar and try compiling again...

(As a side note, you may want to pick a suffix like _CLASS to add to token class names; it would make finding out which reported tokens are just classes much easier.)

Note that ANTLR will create token definitions for tokens that are misspelled. In our CONST case, ANTLR happily creates a

#token CONST "CONST"               

definition for us... Note also that the -w2 option is also used to display ambiguities in the grammar that are fixed by predicates that you have added. Therefore, you probably don't want to use -w2 every time you run ANTLR, just once in a while to check for bad tokens...

What happened? It looks like we have a few grammar problems. Let's look at them.

Remember our dicussion about adding END as a statement. Looks like it came back and bit us on the but. But why?

The rules that are giving us trouble are rules like

subprogramBody

  : (basicDecl)*

    (procedureDecl)*

    BEGIN

        ( statement )*

    END IDENT

  ;

               

But I thought we guarded this with predicates for the END statement?

It turns out that syntactic predicates are not "hoisted" like semantic predicates are. (If I am wrong on this, please let me know -- I'm working from what appears to be generated in the code...) A syntactic predicate just says "I have a decision to make -- try this alternative, then the next. That type of information can't be relayed up to an invoking rule's "if lookahead looks like..." check. So how do we handle this now? We can:

If we specify -k 2 on the ANTLR command-line, our problem goes away. (Note, if you try this, remove the syntactic predicates from the grammar.) I'll stick with the predicates because it's a more complex example and you may have to do something like this with another project.

If we just move the predicates up, it may work, but now we have extra predicates all over the grammar. Our goal was to fix the lookahead problem in a single place. So let's create a single place. Instead of using (statement)* to represent a statement list, we'll create a rule called statementList:

statementList

  : statementList statement

  | // nothing

  ;

               

Some of you may be saying "but..." but I'll cut you off because I've spent too much time in yacc-land recently. Silly me coded the rule this way. Yup, left-recursion, an LL no-no, but I'm not too proud to admit I make these mistakes sometimes. ANTLR is very nice about it and reports

error: infinite left-recursion to rule statementList

       from rule statementList               

So I shake my head in disbelief that I typed it (10 minutes before writing this) and change it to

statementList

  : statement statementList

  | // nothing

  ;

               

and compile again...

Better; we're still getting the message about the END statement. But now the conflict is isolated to one spot. The conflict is that we can choose statement when we see "END" in the input stream, or say "we're done with the statementList" because it can be followed by "END." It is in this rule that we need to make the decision. So...

...we could put a syntactic predicate around the call to statement. But that would be incrediby wasteful. A "guess" parse would take place for every statement. Not good. But we still need to make a guess at this point. So what do we do?

We move the endStatement out of the statement rule and into statementList:

statementList

  : statement statementList

  | (endStatement)? endStatement statementList

  | // nothing

  ;

               

And now, the conflict disappears. We have applied the syntactic predicate to the point of the conflict.

After we compile this, we have one last conflict. In primitiveElement we have a choice between constantValue and variableReference which could both be an IDENT! Why do we have IDENT in both places? Because the semantics, (the meanings) of the references are different. This should be a hint that we will need a semantic predicate to properly resolve this (without mangling the grammar.) The semantic predicate we will be using will say "if the IDENT is a constant, match constantValue; if it's not, match variableReference." That type of check requires a symbol table, which we don't have yet. So, we leave this conflict in the grammar. This will produce code for primitiveElement like

primitiveElement() {

    if next token is IDENT || STRING_LITERAL

                           || INTLIT

        call constantValue

    else if next token is IDENT

        call variableReference

    else if next token is LPAREN

        match LPAREN

        call expression

        match RPAREN

    else

        report error

               

which means that and IDENT that could come in a primitiveElement context will be matched through constantValue. This is a bit of a problem, because things like a[4] cannot be matched, as they are handled by variableReference. Since constantValue only uses a single IDENT by itself, we won't lose anything by swapping it with variableReference, so swap 'em!


Building it for Real

OK. At this point, you should have some code that looks like the code linked below:

All of the code, makefiles and VC++ 4.2 project file, are in the following zip files (pkzip and tar.gz formats):

I may have mucked something up, so just in case, get these. I know they are correct... Build them and things should be clean (with the exception of the conflict in primitiveElement.) Now, you can try some simple tests:

The sample code we will use to test the XL compiler is in the following zip files (pkzip and tar.gz formats):

Get these files, unzip them and run them through the xl parser. For example:

 xl < sample1.xl               

Wrapup

All of this HTML is available as

This wraps up the recognizer section of this tutorial. I hope it's been helpful at some level for you. As always, please let me know your comments (good or bad -- helps the ego ya know) at scott@javadude.com.

Unfortunately, I've pretty much stopped work on this, other than prettying it up and adding navigation aids.  Terence Parr and I are working on a Tutorial book for ANTLR 2.0, the new, Java version of PCCTS.  Stay tuned to comp.compilers.tools.pccts  for information!