htmlex - libextractor - GNU libextractor

commit 10764c9eb4ab4e0674c7142f66419a6539f92c1e
parent b5b023205ca70b1ce23c565974e2e6bc58799498
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon, 19 Sep 2005 02:22:15 +0000

htmlex

Diffstat:
M src/plugins/htmlextractor.c  | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------

1 file changed, 84 insertions(+), 9 deletions(-)
diff --git a/src/plugins/htmlextractor.c b/src/plugins/htmlextractor.c
@@ -1,6 +1,6 @@
 /*
      This file is part of libextractor.
-     (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff
+     (C) 2002, 2003, 2004, 2005 Vidyut Samanta and Christian Grothoff
 
      libextractor is free software; you can redistribute it and/or modify
      it under the terms of the GNU General Public License as published
@@ -124,6 +124,13 @@ addKeyword(EXTRACTOR_KeywordType type,
 
 /* ******************** parser helper functions ************** */
 
+static int tagMatch(const char * tag,
+		    const char * s,
+		    const char * e) {
+  return ( ( (e - s) == strlen(tag)) &&
+	   (0 == strncasecmp(tag, s, e-s)) );
+}
+
 static int lookFor(char c, 
 		   size_t * pos, 
 		   const char * data,
@@ -186,6 +193,43 @@ static int lookForMultiple(const char * c,
   return p < size;
 }
 
+static void findEntry(const char * key,
+		      const char * start,
+		      const char * end,
+		      const char ** mstart,
+		      const char ** mend) {
+  size_t len;
+
+  *mstart = NULL;
+  *mend = NULL;
+  len =  strlen(key);
+  while (start < end - len - 1) {
+    start++;
+    if (start[len] != '=') 
+      continue;         
+    if (0 == strncmp(start, 
+		     key,
+		     len)) {
+      start += len+1;
+      *mstart = start;
+      if ( (*start == '\"') ||
+	   (*start == '\'') ) {
+	start++;
+	while ( (start < end) &&
+		(*start != **mstart) )
+	  start++;	
+	(*mstart)++; /* skip quote */
+      } else {
+	while ( (start < end) &&
+		(! isspace(*start)) )
+	  start++;
+      }
+      *mend = start;
+      return;
+    }
+  }
+}
+
 /**
  * Search all tags that correspond to "tagname".  Example:
  * If the tag is <meta name="foo" desc="bar">, and
@@ -200,6 +244,39 @@ static char * findInTags(TagInfo * t,
 			 const char * keyname,
 			 const char * keyvalue,
 			 const char * searchname) {
+  const char * pstart;
+  const char * pend;
+
+  while (t != NULL) {
+    if (tagMatch(tagname,
+		 t->tagStart,
+		 t->tagEnd)) {
+      findEntry(keyname,
+		t->tagEnd,
+		t->dataStart,
+		&pstart,
+		&pend);
+      if ( ( pstart != NULL) &&
+	   (tagMatch(keyvalue,
+		     pstart,
+		     pend)) ) {
+	findEntry(searchname,
+		  t->tagEnd,
+		  t->dataStart,
+		  &pstart,
+		  &pend);
+	if (pstart != NULL) {
+	  char * ret = malloc(pend - pstart + 1);
+	  memcpy(ret, 
+		 pstart,
+		 pend - pstart);
+	  ret[pend-pstart] = '\0';
+	  return ret;
+	}
+      } 
+    }
+    t = t->next;
+  }
   return NULL;
 }
 
@@ -283,10 +360,9 @@ libextractor_html_extract(const char * filename,
       i++;
     } 
     /* abort early if we hit the body tag */
-    if ( (tag.tagEnd - tag.tagStart == strlen("body")) &&
-	 0 == strncasecmp("body",
-			  tag.tagStart,
-			  tag.tagEnd - tag.tagStart))
+    if (tagMatch("body",
+		 tag.tagStart,
+		 tag.tagEnd))
       break; 
   }
 
@@ -341,10 +417,9 @@ libextractor_html_extract(const char * filename,
   
   while (tags != NULL) {
     t = tags;
-    if ( (t->tagEnd - t->tagStart == strlen("title")) &&
-	 0 == strncasecmp("title",
-			  t->tagStart,
-			  t->tagEnd - t->tagStart))
+    if (tagMatch("title",
+		 t->tagStart,
+		 t->tagEnd)) 
       prev = addKeyword(EXTRACTOR_TITLE,
 			convertToUtf8(t->dataStart,
 				      t->dataEnd - t->dataStart,

	libextractor GNU libextractor
	Log \| Files \| Refs \| Submodules \| README \| LICENSE