// Copyright (c) 1999-2013 Pavel Rychly, Milos Jakubicek

/*
 * Quick help
 *
 * I - resolving ambiguities:
 * 1) syntactic predicates
 *      (nterm) => nterm
 *      - if nterm matches, use it (instead of alternatives) => order of alternatives matters
 *
 * 2) semantic predicates
 *      a) disambiguating: {test==1}? nterm1 nterm2 // can be used only for disambiguating
 *      b) gated: {test==1}? => nterm1 nterm2 // works even for deterministic alternatives, can be used for turning on/off rules
 *      c) validating: nterm1 {valid==1}? // throws exception if predicate fails
 *
 * When using disambiguating predicates, it is much better to use:
 *      {LT(1)->getText(LT(1))->compare(LT(1)->getText(LT(1)), "some_token") == 0}? a=ATTR^ nterm1
 * than
 *      a=ATTR^ {$a.text->compare($a.text, "some_token") == 0}? nterm1
 * since the first variant disambiguates while the second one doesn't.
 *
 * II - building trees
 * 1) AST operators (old antlr2 way): ^ denotes root, ! doesn't include
 * 2) rewriting rules: nterm1 nterm2 -> ^(MYROOT nterm1 nterm2) // creates tree rooted by MYROOT with 2 leaves
 * You cannot mix 1) and 2) within a single rule alternative!!!
 */

grammar cqp;

options {
    output = AST;
    language = C;
    ASTLabelType = pANTLR3_BASE_TREE;
    k = 2;
}

// Imaginary tokens (no literal text) used as AST node types, followed by
// the keyword tokens with their literal spelling.
tokens {
    OPT;
    REPOPT;
    SEQ;
    ANYPOS;
    BEGSTRUCT;
    ENDSTRUCT;
    WHOLESTRUCT;
    KW_MEET='meet';
    KW_UNION='union';
    KW_WITHIN='within';
    KW_CONTAINING='containing';
    KW_MU='MU';
    KW_FREQ='f';
    KW_WS='ws';
    KW_SWAP='swap';
    KW_CCOLL='ccoll';
}

/*
 * If you omit the 'parser::' or 'lexer::' prefix, parser is assumed, I'm rather explicit always.
 */

// NOTE(review): the first two #include directives below carry no header
// name; the names appear to have been lost -- restore them from the
// upstream source before building.
@parser::includes {
    #include
    #include
    #include "parsop.hh"
    #include "frsop.hh"
    #include "corpus.hh"
    #include "cqpeval.hh"
}

@lexer::includes {
    #include "cqpeval.hh"
}

@lexer::header {
// Copyright (c) 1999-2013 Pavel Rychly, Milos Jakubicek
}

@parser::header {
// Copyright (c) 1999-2013 Pavel Rychly, Milos Jakubicek
}

// Route recognition errors of both lexer and parser to throwEvalQueryException.
@lexer::apifuncs {
    RECOGNIZER->displayRecognitionError = throwEvalQueryException;
}

@parser::apifuncs {
    RECOGNIZER->displayRecognitionError = throwEvalQueryException;
}

//-------------------- lexer --------------------

// Whitespace is skipped.
WS_     : (' ' | '\t' | '\n' | '\r') { SKIP(); } ;
NUMBER  : ('0'..'9')+ ;
NNUMBER : '-' ('0'..'9')+ ;
ATTR    : ('a'..'z'|'A'..'Z'|'_') ('a'..'z'|'A'..'Z'|'_'|'0'..'9')* ;
// Double-quoted string; a backslash escapes the following character.
REGEXP  : '"' ('\\' . |~('"'|'\\'))* '"'
          {
              //remove surrounding quotes
              SETTEXT(GETTEXT()->subString(GETTEXT(),1,GETTEXT()->size-2));
          }
        ;

LPAREN:   '(';
RPAREN:   ')';
LBRACKET: '[';
RBRACKET: ']';
LBRACE:   '{';
RBRACE:   '}';
STAR:     '*';
PLUS:     '+';
QUEST:    '?';
BINOR:    '|';
BINAND:   '&';
DOT:      '.';
COMMA:    ',';
SEMI:     ';';
COLON:    ':';
EQ:       '=';
EEQ:      '==';
TEQ:      '~';
NOT:      '!';
LEQ:      '<=';
GEQ:      '>=';
LSTRUCT:  '<';
RSTRUCT:  '>';
SLASH:    '/';
POSNUM:   '#';

//-------------------- parser --------------------

// Top-level rule: a sequence, an optional global-condition part, any number
// of within/containing suffixes, and a terminating semicolon. Each suffix
// wraps the tree built so far ($query) under a new root.
query
    : // standard query sequence, including MU queries
      (sequence -> sequence)
      // possible global constraints (e.g. '& 1.tag = 2.tag')
      (BINAND globpart -> ^(BINAND $query globpart))?
      // possible within/containing suffix
      (NOT? kw=(KW_WITHIN|KW_CONTAINING) wc=within_containing_part -> ^($kw NOT? $query $wc))*
      // every query ends with ;
      SEMI
    ;

// One or more global conditions joined by '&' (the '&' is dropped from the tree).
globpart
    : globcond (BINAND! globcond)*
    ;

// A single global condition: either NUMBER.ATTR [!]= NUMBER.ATTR (rooted at
// the first DOT) or f(NUMBER.ATTR) [!] <comparison> NUMBER (rooted at KW_FREQ).
globcond
    : NUMBER DOT^ ATTR NOT? EQ NUMBER DOT! ATTR
    | KW_FREQ^ LPAREN! NUMBER DOT! ATTR RPAREN! NOT? (EQ|LEQ|GEQ|LSTRUCT|RSTRUCT) NUMBER
    ;

// Operand of 'within' / 'containing'.
// NOTE(review): the deprecation message below reads as if example structure
// tags were lost from its text ("use ... instead of ... for") -- confirm
// the wording against the upstream source. The string is kept as found.
within_containing_part
    : (NOT)?
      ( (LSTRUCT! structure RSTRUCT!) => {throw EvalQueryException (": Deprecated query syntax: use instead of for matching the content of a structure. will match only the beginning.");} LSTRUCT! structure RSTRUCT! // DEPRECATED. TO BE REMOVED.
      | sequence
      | within_number
      | alignedpart
      )
    ;

within_number // translate "within NUMBER" as "within []{NUMBER}"
    : n=NUMBER -> ^(SEQ ^(REPOPT["REp"] ^(LBRACKET["["] ^(ANYPOS["[]"])) ^(REPOPT["RO"] $n $n)))
    ;

// Structure name with an optional attribute/value condition list.
structure
    : ATTR attvallist?
    ;

// A bare attribute/value list terminated by ';' (the ';' is dropped from the tree).
oneposonly
    : attvallist SEMI!
    ;

alignedpart
    : ATTR COLON^ sequence // parallel alignment
    ;

//-------------------- meet/union query --------------------

// Parenthesised meet or union operation; the parentheses are dropped.
mupart
    : LPAREN! (unionop | meetop) RPAREN!
    ;

// A non-negative or negative number; a negative one is re-typed as a NUMBER
// node carrying the same text.
integer
    : NUMBER
    | n=NNUMBER -> ^(NUMBER[$n.text->chars])
    ;

// 'meet' of two positions with an optional pair of integers.
meetop
    : KW_MEET^ position position (integer integer)?
    ;

// 'union' of two positions.
unionop
    : KW_UNION^ position position
    ;

//-------------------- regular expression query --------------------

// Alternatives separated by '|'.
sequence
    : seq (BINOR^ seq)*
    ;

// One or more repetitions collected under a SEQ node.
seq
    : repetition+ -> ^(SEQ repetition+)
    ;

// An atomic query with an optional repetition operator, or a structure tag
// in one of three forms: '<' structure '>', '<' structure '/' '>', or
// '<' '/' structure '>'.
// NOTE(review): the texts given to WHOLESTRUCT/BEGSTRUCT/ENDSTRUCT below are
// empty strings; they may originally have held tag-like labels -- confirm
// against the upstream source. They are kept as found.
repetition
    : atomquery
      ( repopt -> ^(REPOPT["REp"] atomquery repopt)
      |        -> atomquery
      )
    | LSTRUCT
      ( structure
        ( SLASH -> ^(WHOLESTRUCT[""] structure)
        |       -> ^(BEGSTRUCT[""] structure)
        )
      | SLASH structure -> ^(ENDSTRUCT[""] structure)
      )
      RSTRUCT
    ;

// Attribute conditions joined by '|' ...
attvallist
    : attvaland (BINOR^ attvaland)*
    ;

// ... each of which is a chain of conditions joined by '&'.
attvaland
    : attval (BINAND^ attval)*
    ;

// A single attribute condition.
attval
    : ATTR NOT?
      ( (EQ^|LEQ^|GEQ^|TEQ^ NUMBER?) REGEXP
      | e=EEQ^ r=REGEXP { unescapeString($r); }
      )
    // '#' NUMBER with an optional NNUMBER; when the NNUMBER is missing, an
    // NNUMBER node with the text of the NUMBER is created instead
    | POSNUM n=NUMBER
      ( NNUMBER -> ^($n NNUMBER)
      |         -> ^($n NNUMBER[$n.text->chars])
      )
    | NOT^ attval
    | LPAREN! attvallist RPAREN!
    | KW_WS LPAREN
      ( l=NUMBER COMMA s=NUMBER -> ^(KW_WS $l $s)
      | w1=REGEXP COMMA r=REGEXP COMMA w2=REGEXP -> ^(KW_WS $w1 $r $w2)
      )
      RPAREN
    | KW_SWAP^ LPAREN! NUMBER COMMA! attvallist RPAREN!
    | KW_CCOLL^ LPAREN! NUMBER COMMA! NUMBER COMMA! attvallist RPAREN!
    ;

// A position, or a parenthesised sequence followed by any number of
// within/containing restrictions.
atomquery
    : position
    | LPAREN (sequence -> sequence)
      ( NOT? KW_WITHIN within_containing_part -> ^(KW_WITHIN NOT? $atomquery within_containing_part)
        // $atomquery = the part of the rule matched so far, i.e. sequence
      | NOT? KW_CONTAINING within_containing_part -> ^(KW_CONTAINING NOT? $atomquery within_containing_part)
      )*
      RPAREN
    ;

// A position, optionally labelled as NUMBER ':' position.
position
    : oneposition
    | n=NUMBER c=COLON p=oneposition -> ^(LBRACKET["["] ^($c $n $p))
    ;

// A single position: '[' conditions ']', '[]', a bare string, or a
// '~'-prefixed string; also the meet/union form.
oneposition
    : LBRACKET
      ( attvallist -> ^(LBRACKET["["] attvallist)
      |            -> ^(LBRACKET["["] ^(ANYPOS["[]"]))
      )
      RBRACKET
    | REGEXP -> ^(LBRACKET["["] ^(EQ["="] ATTR["-"] REGEXP))
    // NUMBER is optional in the input, so it is referenced as NUMBER? in the
    // rewrite as well (consistent with the NOT? rewrites in this grammar)
    | TEQ NUMBER? REGEXP -> ^(LBRACKET["["] ^(TEQ["~"] ATTR["-"] NUMBER? REGEXP))
    | KW_MU {throw EvalQueryException (": Deprecated query syntax: remove the 'MU' keyword from your query");}// KW_MU! mupart //DEPRECATED
    | mupart
    ;

// Repetition operator, always rewritten to ^(REPOPT["RO"] min max);
// "-1" is used as the max for '*', '+' and '{n,}'.
repopt
    : STAR  -> ^(REPOPT["RO"] NUMBER["0"] NUMBER["-1"])
    | PLUS  -> ^(REPOPT["RO"] NUMBER["1"] NUMBER["-1"])
    | QUEST -> ^(REPOPT["RO"] NUMBER["0"] NUMBER["1"])
    | LBRACE n1=NUMBER
      ( COMMA
        ( n2=NUMBER -> ^(REPOPT["RO"] $n1 $n2)
        |           -> ^(REPOPT["RO"] $n1 NUMBER["-1"])
        )
      | -> ^(REPOPT["RO"] $n1 $n1)
      )
      RBRACE
    ;

// vim: syntax=antlr3