/* This is a lex(1) file, see http://dinosaur.compilertools.net/
 * or http://en.wikipedia.org/wiki/Lex_programming_tool .
 *
 * Compilation on UNIX systems is done by
 *   make risDateAdj
 * On other systems one may need to call lex or flex and cc explicitly:
 *   lex -8 -o risDateAdj.c risDateAdj.l
 *   cc [-std=gnu99] [-s] [-O] -o risDateAdj risDateAdj.c -ll
 *
 * The executable works as a filter and patches the Y1, Y2 and PY fields of RIS
 * files (the standard input) from a
 *   Apr., YYYY
 * type produced for example by JSTOR or a
 *   YYYY-MM-DD
 * or a
 *   YYYY
 * or a
 *   YYYY/MM
 * type to the standard defined in http://www.refman.com/support/risformat_intro.asp .
 *
 * TODO:
 * ingentaconnect produces lines PY -- ///January 2009
 * which also fall into the category of faulty but recoverable inputs.
 * And this ought also be implemented.
 * Richard J. Mathar, 2009-02-11
 */
%option noyywrap

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <regex.h>

#define Y1PATCH_PMAT_SIZ 7

/* Length of a RIS tag prefix: two letters, two blanks, a dash, one blank.
 * Must agree with the RISTAG definition below.
 */
enum { RIS_TAG_LEN = 6 } ;

/* Compiled in main(), used by y1patch():
 * preg1  month name, separators, 4-digit year
 * preg2  ISO YYYY-MM-DD at the start of the date field
 * preg3  month name, 1-or-2-digit day, 4-digit year
 * preg4  YYYY or YYYY/MM at the start of the date field
 */
regex_t preg1, preg2 , preg3 , preg4 ;
regmatch_t pmat[Y1PATCH_PMAT_SIZ] ;

/* Fit monam month to integer in the range 1 to 12.
 * Only the first three characters are compared, ignoring case, so full
 * month names are recognized as well as their abbreviations.
 * @param monam text that starts with a month name
 * @return Return a number from 1 to 12 if the month is recognized, 0 if not.
 */
int name2Month(const char *monam)
{
	static const char *jan2dec[] = {"jan","feb","mar","apr","may","jun","jul","aug","sep","oct","nov","dec"} ;
	for(int i=0; i < 12 ; i++)
	{
		if( strncasecmp(monam,jan2dec[i],3) == 0 )
			return i+1 ;
	}
	return 0 ;
}

/* Generate a YYYY/MM/DD/otherinfo line on output.
 * The tag prefix is copied through unchanged; the date that follows it is
 * rewritten if one of the four supported layouts is recognized, otherwise
 * the line is copied through as it is.
 * @param tagline the original line, including the tag and the line end
 */
void y1patch(const char *tagline)
{
	/* start of the date field, right behind the tag prefix */
	const char *date = tagline + RIS_TAG_LEN ;

	/* the original tag including the mandatory white space copied thru */
	fprintf(yyout,"%.*s",RIS_TAG_LEN,tagline) ;

	if( regexec(&preg2,date,0,NULL,0) == 0 )
	{
		/* The ISO case. Replace dashes by slashes.
		 * preg2 is anchored, so the fixed offsets are valid:
		 * date+0 YYYY, date+5 MM, date+8 DD, date+10 everything else
		 * (nothing of the remainder is dropped).
		 */
		fprintf(yyout,"%.4s/%.2s/%.2s/%s",date,date+5,date+8,date+10) ;
	}
	else if( regexec(&preg1,date,Y1PATCH_PMAT_SIZ,pmat,0) == 0 )
	{
		/* pmat[0] all, pmat[1] the month, pmat[2] any intermediate dots and blanks, pmat[3] the year.
		 * The month is read from the start of the field because the
		 * scanner rule guarantees the field starts with a month name.
		 */
		int mon = name2Month(date) ;
		int yr = atoi(date+pmat[3].rm_so) ;
		if ( mon )
			fprintf(yyout,"%d/%02d// ",yr,mon) ;
		else
			fprintf(yyout,"%d/// ",yr) ;
		/* the original text is kept as the "other info" part */
		fprintf(yyout,"%s",date) ;
	}
	else if( regexec(&preg3,date,Y1PATCH_PMAT_SIZ,pmat,0) == 0 )
	{
		/* pmat[0] all, pmat[1] the month, pmat[2] the date, pmat[3] the year. */
		int mon = name2Month(date) ;
		int dat = atoi(date+pmat[2].rm_so) ;
		int yr = atoi(date+pmat[3].rm_so) ;
		if ( mon )
			fprintf(yyout,"%d/%02d/%02d/ ",yr,mon,dat) ;
		else
			fprintf(yyout,"%d//%02d/ ",yr,dat) ;
		/* the original text is kept as the "other info" part */
		fprintf(yyout,"%s",date) ;
	}
	else if( regexec(&preg4,date,Y1PATCH_PMAT_SIZ,pmat,0) == 0 )
	{
		/* preg4 is anchored, so the year sits at the start of the field */
		int yr = atoi(date) ;
		if ( pmat[1].rm_so >=0 )
		{
			/* pmat[1] is the optional "/MM" group; skip its slash */
			int mon = atoi(date+pmat[1].rm_so+1) ;
			fprintf(yyout,"%d/%02d//",yr,mon) ;
		}
		else
			fprintf(yyout,"%d///",yr) ;
		/* whatever followed the matched date, i.e. the line end */
		fprintf(yyout,"%s",date+pmat[0].rm_eo) ;
	}
	else
		/* give up and copy through */
		fprintf(yyout,"%s",date) ;
}
#undef Y1PATCH_PMAT_SIZ
%}

RISTAG	"Y1  - "|"PY  - "|"Y2  - "

DIGIT	[0-9]

/* the scanner ought be compiled with the "-i" flag to
 * trigger also on the various upper/lowercase variants of these patterns
 */
MONTH	"Jan"|"Feb"|"Mar"|"Apr"|"May"|"Jun"|"Jul"|"Aug"|"Sep"|"Oct"|"Nov"|"Dec"

%%
	/* Lines which look correct are copied through as they are.
	 * This means they contain no 3-letter month acronyms as substrings.
	 */

	/* If the tag is immediately followed by one of the TLA (three-letter
	 * acronyms above), or looks like an ISO date we try conversion.
	 * The ISO rule accepts an empty remainder so that lines with plain
	 * "\n" line ends are converted too.
	 */
{RISTAG}{MONTH}.+\n	|
{RISTAG}{DIGIT}{4}[[:blank:]]*\r?\n	|
{RISTAG}{DIGIT}{4}\/{DIGIT}{1,2}[[:blank:]]*\r?\n	|
{RISTAG}{DIGIT}{4}-{DIGIT}{2}-{DIGIT}{2}.*\n	{
		/* debugging
		 * printf("%d\n",__LINE__) ;
		 */
		y1patch(yytext) ;
	}

%%

/* Compile one of the date patterns and report a failure on stderr.
 * @param preg where to store the compiled expression
 * @param pattern the regular expression source
 * @param cflags flags handed to regcomp(3)
 * @return 0 on success, the regcomp(3) error code otherwise
 */
static int compilePattern(regex_t *preg, const char *pattern, int cflags)
{
	const int stat = regcomp(preg,pattern,cflags) ;
	if ( stat )
	{
		char msg[128] ;
		regerror(stat,preg,msg,sizeof msg) ;
		fprintf(stderr,"Internal error %d compiling \"%s\": %s\n",stat,pattern,msg) ;
	}
	return stat ;
}

/* Compile the four date patterns, filter standard input to standard output.
 * @return 0 on success, EXIT_FAILURE if a pattern could not be compiled
 */
int main(int argc, char *argv[])
{
	(void)argc ;
	(void)argv ;

	/* preg1: at least three characters (matching months supposedly)
	 * followed by any comma, dot and white space, and the year
	 */
	if ( compilePattern(&preg1, "\\([[:alpha:]]\\{3,\\}\\)\\([[:space:],\\./]*\\)\\([[:digit:]]\\{4\\}\\)\\(.*\\)", REG_ICASE) )
		return EXIT_FAILURE ;

	/* preg2: the ISO YYYY-MM-DD format, anchored because y1patch()
	 * addresses the three parts by fixed offsets from the field start
	 */
	if ( compilePattern(&preg2, "^[[:digit:]]\\{4\\}-[[:digit:]]\\{2\\}-[[:digit:]]\\{2\\}.*", REG_ICASE) )
		return EXIT_FAILURE ;

	/* preg3: the fullmonthname 1-or-2digit-date, 4-digityear */
	if ( compilePattern(&preg3, "\\([[:alpha:]]\\{3,\\}\\) \\([[:digit:]]\\{1,2\\}\\)[ ,]*\\([[:digit:]]\\{4\\}\\).*", REG_ICASE) )
		return EXIT_FAILURE ;

	/* preg4: a sole 4-digit year, optionally with a slash and 2-digit month,
	 * anchored because y1patch() reads the year from the field start
	 */
	if ( compilePattern(&preg4, "^[[:digit:]]{4}(/[[:digit:]]{1,2})?[[:blank:]]*", REG_ICASE|REG_EXTENDED) )
		return EXIT_FAILURE ;

	yylex() ;

	regfree(& preg4) ;
	regfree(& preg3) ;
	regfree(& preg2) ;
	regfree(& preg1) ;
	return 0 ;
}