/* vim: set sw=8 ts=8 si : */
/* Author: Guido Socher, Copyright: GPL */
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <strings.h>
#include <string.h>
#include "config.h"
/* tags longer than this are ignored:*/
#define MAXSTRLEN 400
#define MAXGREPLEN 320
struct buffer{
	char string[MAXGREPLEN];
	int pos;
};
/*begin global data*/
static int ch; /*current character in file*/
static int l; /*current line number*/
static int lstart; /*line number where tag started*/
static int state;
static int substate;
static struct buffer buf;
static char *buftagstart_ptr;
static struct buffer ctag; /*the current tag name e.g img or font or "li " */
static char **begtlist; /* a list of tags that mark the beginning. 1 tag in */
                        /* the current implementaion */
static char **endtlist; /* a list of tags that mark the end */
/*end global data*/

/* Print the usage text to stdout and terminate the program with exit
 * status 0. This function does not return. */
void help()
{
	/* The text contains no format directives, so it is written as is. */
	fputs("taggrep -- search for a given html-tag and display its content\n"
	      "\n"
	      "USAGE: taggrep [-hs] [-c closetag1,closetag2,...] starttag html-files\n"
	      "\n"
	      "OPTIONS: -h this help\n"
	      "         -c Print everything from the starttag until one of the closetags\n"
	      "\t    is found. \n"
	      "         -s print in short format; do not print the file names\n"
	      "\n"
	      "EXAMPLE: List all anchor <a ...> tags without the closing ...</a>:\n"
	      "          taggrep a *.html\n"
	      "         List all doc titles:\n"
	      "\t  taggrep -c title,head title *.html\n"
	      "\t List all <li> closed by either a second <li> or a </li>,</ul>,</ol>\n"
	      "\t  taggrep -c ul,ol,li li *.html\n"
	      "Note: The length of the tags including their enclosed content is on purpose\n"
	      "      limited to 300 characters. This is to limit the output if a tag is\n"
	      "      not closed correctly or if you forgot to add a tag to the list of \n"
	      "      possible closing tags.\n",
	      stdout);
	exit(0);
}
/* Append the current character (global ch) to the collection buffer buf.
 * The last byte of the buffer is reserved for a terminating '\0': once
 * the buffer is full, further characters are dropped and the buffer is
 * left terminated instead of writing past the end of the array. */
void addbuf(){
	if (buf.pos >= (int)sizeof(buf.string) - 1){
		buf.string[sizeof(buf.string) - 1]='\0';
		return;
	}
	buf.string[buf.pos]=(char)ch;
	buf.pos++;
}
/* Append the current character (global ch) to the tag name buffer ctag
 * and advance its write position by one. */
void addctag(){
	ctag.string[ctag.pos++]=(char)ch;
}
/* Terminate the tag name collected in ctag with '\0' and rewind the
 * write position, so that the next tag name starts at the beginning. */
void termctag(){
	char *end=ctag.string+ctag.pos;

	*end='\0';
	ctag.pos=0;
}
			
/* Print the text collected in buf and reset the parser.
 *
 * shortprint: if non-zero only the collected text is printed, otherwise
 *             the line is prefixed with "filename:line: " where line is
 *             the line on which the tag started (lstart).
 * filename:   name of the file being scanned; only used for the prefix.
 *
 * buf.string is '\0' terminated in place before printing (callers rely
 * on that). Afterwards buf and ctag are emptied and state is set to 0.
 * The current character ch is left untouched.
 * Always returns 0. */
int print_result(int shortprint,char *filename){
	int end=buf.pos;

	/* never write the terminator outside of the buffer */
	if (end >= (int)sizeof(buf.string)) end=(int)sizeof(buf.string)-1;
	buf.string[end]='\0';
	if (shortprint) {
		printf("%s\n",buf.string);
	}else{
		printf("%s:%d: %s\n",filename,lstart,buf.string);
	}
	buf.pos=0;
	ctag.pos=0;
	state=0;
	return(0);
}
/* Split a comma separated list (one long string) into an array of
 * strings. The array is NULL terminated and the strings are stored in
 * lower case. Empty entries (leading, trailing or doubled commas) are
 * skipped, so the result never contains an empty string.
 *
 * The returned array and the strings it points to are allocated with
 * malloc() and are meant to live until the program ends; the input
 * string itself is not modified.
 * If memory can not be allocated an error is printed and the program
 * exits with status 1. */
static char **str_list(const char *incommalist){
	char **list;
	char *copy,*p,*tok;
	size_t slots=2; /* one string and the terminator */
	size_t n=0;

	copy=malloc(strlen(incommalist)+1);
	if (copy==NULL){
		fprintf(stderr, "ERROR: out of memory\n");
		exit(1);
	}
	strcpy(copy,incommalist);
	/* first pass: count the commas and convert to lower case */
	for(p=copy;*p;p++){
		if(*p==','){
			slots++;
		}else{
			/* tolower() needs a value in the unsigned char range */
			*p=(char)tolower((unsigned char)*p);
		}
	}
	list=malloc(slots*sizeof *list);
	if (list==NULL){
		fprintf(stderr, "ERROR: out of memory\n");
		exit(1);
	}
	/* second pass: cut the copy at every comma and remember the
	 * start of each non empty piece */
	tok=copy;
	for(p=copy;;p++){
		if(*p==',' || *p=='\0'){
			int last=(*p=='\0');

			*p='\0';
			if (p>tok){
				list[n]=tok;
				n++;
			}
			if (last) break;
			tok=p+1;
		}
	}
	list[n]=NULL;
	return(list);
}
/* Compare two '\0' terminated strings byte by byte.
 * Returns 1 if a and b have the same content, 0 otherwise. */
int streq(const char *a,const char *b){
	return(strcmp(a,b)==0);
}
/* Find out if a given string is in the string list.
 *
 * checkstr: the string to look for, in any mix of upper and lower case.
 * list:     NULL terminated array of lower case strings, i.e. of the
 *           format returned by str_list().
 *
 * The comparison is not case sensitive: every character of checkstr is
 * converted to lower case before it is compared with the list entry.
 * checkstr is compared in full length and is not modified.
 * Returns 1 if checkstr matches one of the entries, 0 otherwise. */
int is_in_str_list(const char *checkstr,char **list){
	for(;*list;list++){
		const char *want=*list;
		const char *have=checkstr;

		/* tolower() needs a value in the unsigned char range;
		 * the loop ends at the first difference or at the end
		 * of the list entry */
		while(*want && *want==(char)tolower((unsigned char)*have)){
			want++;
			have++;
		}
		/* both must be at \0 */
		if (*want=='\0' && *have=='\0') return(1);
	}
	return(0);
}
/* Decide what to do once the name of a possible start tag is complete.
 * Called from the main loop with the tag name in ctag and with ch being
 * the character that ended the name.
 *
 * opt_c:    non-zero if the search runs until a closing tag (-c option)
 * opt_s:    short print flag, handed on to print_result()
 * filename: name of the current file, handed on to print_result()
 *
 * The tag name is terminated first. If it is not in begtlist the
 * collected text is dropped and the parser goes back to state 0.
 * A complete tag "<xxx>" without -c is printed right away. In all
 * other cases (the tag has attributes, or -c is given) the parser
 * continues collecting in state 3. */
void check_begin_tag(int opt_c,int opt_s,char *filename){
	termctag();
	if (!is_in_str_list(ctag.string,begtlist)){
		/*wrong tag*/
		buf.pos=0;
		state=0;
		return;
	}
	if (ch=='>' && !opt_c){
		/* a tag of the form <xxx> and nothing more is wanted */
		print_result(opt_s,filename);
		return;
	}
	/* either <xxxx bla=zz> is not finished yet or we have to read
	 * on until an end tag is found */
	state=3;
}


/* Program entry point.
 *
 * Parses the options (-c closetags, -s, -h), takes the first remaining
 * argument as the comma separated list of start tags and all following
 * arguments as names of the files to scan. If fewer than two arguments
 * remain after the options the help text is shown.
 *
 * Every file is read character by character. Newline, carriage return
 * and tab are turned into a space and runs of spaces are reduced to one
 * space. The characters drive a state machine (global state):
 *    0     outside a relevant tag
 *    1     directly after a '<'
 *    2     reading the name of a possible start tag
 *    3     inside a relevant tag or, with -c, inside its content
 *    4     with -c: directly after a '<' inside the content
 *    5     with -c: inside an opening tag that may end the match
 *   10     with -c: inside a closing tag that may end the match
 *   20-22  inside a "<!- ... ->" comment
 *
 * Returns 0 when all files are processed. Exits with status 1 on an
 * unknown option or if a file can not be opened. */
int main(int argc, char *argv[])
{
	int opt_s=0;
	int opt_c=0;
	int i=0;
	int wasspace=0;
	FILE *fd;
	/* The following things are used for getopt: */
	extern char *optarg;
	extern int optind;
	extern int opterr;

	opterr = 0;
	while ((ch = getopt(argc, argv, "c:hs")) != -1) {
		switch (ch) {
		case 'c':
			endtlist=str_list(optarg);
			opt_c=1;
			break;
		case 's':
			opt_s=1;
			break;
		case 'h':
			help(); /*no break, help does not return */
		case '?':
			fprintf(stderr, "ERROR: No such option. -h for help.\n");
			exit(1);
		/*no default action for case */
		}
	}
	if (optind >= argc -1){
		/* we need at least 2 arguments.*/
		help();
	}
	begtlist=str_list(argv[optind]);
	optind++;
	/* To find the start, we search for </?\w+[ >] */
	while(optind<argc){
		fd=fopen(argv[optind],"r");
		if (fd == NULL){
			fprintf(stderr, "ERROR: can not read %s\n",argv[optind]);
			exit(1);
		}
		/* init */
		l=1; /* line number count */
		state=0;
		wasspace=0;
		while((ch=fgetc(fd))!=EOF){
			/* count the line first, so that a newline which is
			 * swallowed by the truncation below is not lost */
			if (ch=='\n') l++;
			if (buf.pos > MAXGREPLEN - 10){
				if (state > 2 && state < 20){
					/* we are inside a match: indicate
					 * truncated string with dots */
					ch='.';
					for (i=0;i<6;i++) addbuf();
					print_result(opt_s,argv[optind]);
				}
				/* Always drop the collected text. Without
				 * this an over long tag name (state 2)
				 * would leave buf.pos above the limit and
				 * every following character would add
				 * more dots beyond the end of buf. */
				buf.pos=0;
				ctag.pos=0;
				state=0;
				continue;
			}
			if (ch=='\n'|| ch=='\r' || ch=='\t') ch=' ';
			/* kill repeated space */
			if (ch==' ' && wasspace){
				continue;
			}
			if (wasspace) wasspace=0;
			if (ch==' ') wasspace=1;
			switch (state) {
				case 0: /*outside a relevant tag not reading*/
					if (ch=='<'){
						/*init buffers*/
						buf.pos=0;
						ctag.pos=0;
						lstart=l; /*remember line*/
						state++;
						/* add to tag buffer */
						addbuf();
					}
					/*stay here*/
					break;
				case 1: /*The character(s) after <
					 *do not ignore leading space 
					 *(would be an invalid tag). Check
					 *for closing tag but we are not
					 *reading a relevant tag so we 
					 *terminate on a close tag.
					 *We are looking for a relevant begin
					 *tag. */
					addbuf();
					if (isalnum(ch)) {
						addctag();
						state++;
					}else if (ch=='/') {
						/* closing tag but we search 
						 * a start tag */
						state=0;
						break;
					}else if (ch=='!') {
						/*could be a comment*/
						state=20;
						break;
					}else{
						state=0;
						ctag.pos=0;
					}
					break;
				case 2: /*we are reading the tag name only 
					 *this must be a word only*/
					addbuf();
					if (isalnum(ch)) {
						addctag();
						/*stay here*/
					}else check_begin_tag(opt_c,opt_s,argv[optind]);
					break;
				case 3: /* we are reading a relevant tag and 
					 * search for the closing tag or
					 * just the '>' without opt_c or
					 * a new close tag with opt_c */
					addbuf();
					if (opt_c){
						/* with opt_c we look for 
						 * a separate <xx ...> 
						 * or </xx> */
						if (ch=='<') {
							/* remember where this
							 * tag starts in buf */
							buftagstart_ptr=buf.string + buf.pos -1;
							state++;
						}
					}else{
						if (ch=='>'){
							print_result(opt_s,argv[optind]);
						}
					}
					break;
				case 4: /*we have opt_c set and started a
					 *new tag, prev char was '<' */
					addbuf();
					if (isalnum(ch)) {
						addctag();
						substate=0;
						state++;
					}else if (ch==' ') {
						/* ok, leading space*/
						break;
					}else if (ch=='/') {
						/* ok, closing tag*/
						substate=0;
						state=10;
					}else{
						/*hmm, ignore it*/
						ctag.pos=0;
						state=3;
					}
					break;
				case 5: /*we are reading a relevant tag with
					 *opt_c set and have found a "<x" 
					 *or "< x" That is a non terminating
					 *tag might be closing our search */ 
					addbuf();
					if (isalnum(ch)) {
						if (substate==0)addctag();
					}else if (ch =='>'){
						/*end of this begin tag.
						 *which may close our search
						 *We have found <xxxx...>*/
						termctag();
						if (is_in_str_list(ctag.string,endtlist)){
							print_result(opt_s,argv[optind]);
							/* This might be a new
							 * interesting tag. Copy
							 * it from buftagstart_ptr
							 * to the start of buf;
							 * buf.string is already \0
							 * terminated by the
							 * print_result() */
							i=0;
							while(*buftagstart_ptr){
								buf.string[i]=*buftagstart_ptr;
								buftagstart_ptr++;
								i++;
							}
							buf.pos=i;
							/*undo the terminate
							 *of ctag before going
							 *to check_begin_tag*/
							ctag.pos=strlen(ctag.string);
							lstart=l; /*remember line*/
							check_begin_tag(opt_c,opt_s,argv[optind]);
							break;
						}else{
							/*wrong tag continue
							 *reading */
							ctag.pos=0;
							state=3;
						}
					}else{
						/* indicate that the
						 * tag name reading must be
						 * stopped even if we find
						 * further words in the tag. */
						substate=1;
					}
					/*stay here until '>'*/
					break;
				case 10: /* we are reading a relevant tag 
					  * with opt_c set and have
					  * found a </ */
					addbuf();
					if (isalnum(ch)) {
						if(substate==0)addctag();
					}else if (ch =='>'){
						/*end of this closing tag.
						 *We have found </xxxx...>*/
						termctag();
						if (is_in_str_list(ctag.string,endtlist)){
							print_result(opt_s,argv[optind]);
							/*this can not be a new
							 *start tag.*/
							break;
						}else{
							/*wrong tag continue
							 *reading */
							ctag.pos=0;
							state=3;
						}
					}else{
						/* indicate that the
						 * tag name reading must be
						 * stopped even if we find
						 * further words in the tag. */
						substate=1;
					}
					/*stay here until '>' is found*/
					break;
				/*--------------*/
				case 20: /*comment handling, 
					  *we have found "<!", wait for "<!-" */
					if(ch=='-'){
						state++;
					}else{ 
						addbuf();
						state=1;
					}
					break;
				case 21: /*comment handling, 
					  *we have found "<!-", wait for 
					  *comment termination with "->" */
					if(ch=='-'){
						state++;
					}
					break;
				case 22: /*comment end handling, 
					  *we have found "-", wait for ">" */
					if(ch=='>'){
						state=0;
					}else if (ch=='-'){
						/*stay here*/
						break;
					}else{
						state=21;
					}
					break;
				default:
					fprintf(stderr,"Programm Error, state = %d\n",state);
					exit(1);
				}
		}
		fclose(fd);
		optind++;
	}
	return(0);
}
