libextractor

GNU libextractor
Log | Files | Refs | Submodules | README | LICENSE

commit 9b80f03892450f399c6b869d722348d6dd5f2495
parent 5754ecf346e4116a22266ef4fbd3a1b1aef061da
Author: Christian Grothoff <christian@grothoff.org>
Date:   Mon, 14 Dec 2009 18:48:48 +0000

dvi

Diffstat:
M src/include/extractor.h | 2 +-
M src/plugins/Makefile.am | 12 ++++++------
A src/plugins/dvi_extractor.c | 246 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
D src/plugins/dviextractor.c | 221 -------------------------------------------------------------------------------
4 files changed, 253 insertions(+), 228 deletions(-)

diff --git a/src/include/extractor.h b/src/include/extractor.h @@ -277,7 +277,7 @@ enum EXTRACTOR_MetaType EXTRACTOR_METATYPE_GENERATOR = 103, EXTRACTOR_METATYPE_ENCODED_BY = 121, - EXTRACTOR_METATYPE_PRODUCTVERSION = 90, + EXTRACTOR_METATYPE_PROUCUCTVERSION = 90, EXTRACTOR_METATYPE_DISCLAIMER = 27, EXTRACTOR_METATYPE_FULL_DATA = 137, diff --git a/src/plugins/Makefile.am b/src/plugins/Makefile.am @@ -89,6 +89,7 @@ plugin_LTLIBRARIES = \ libextractor_applefile.la \ libextractor_asf.la \ libextractor_deb.la \ + libextractor_dvi.la \ libextractor_html.la \ libextractor_it.la \ libextractor_mime.la @@ -113,6 +114,11 @@ libextractor_deb_la_LDFLAGS = \ libextractor_deb_la_LIBADD = \ -lz +libextractor_dvi_la_SOURCES = \ + dvi_extractor.c +libextractor_dvi_la_LDFLAGS = \ + $(PLUGINFLAGS) + libextractor_html_la_SOURCES = \ html_extractor.c libextractor_html_la_LDFLAGS = \ @@ -135,7 +141,6 @@ libextractor_mime_la_LDFLAGS = \ OLD_LIBS = \ $(pdfplugin) \ - libextractor_dvi.la \ libextractor_elf.la \ $(extraflac) \ libextractor_flv.la \ @@ -244,11 +249,6 @@ libextractor_id3v24_la_LDFLAGS = \ libextractor_id3v24_la_LIBADD = \ $(top_builddir)/src/common/libextractor_common.la -libextractor_dvi_la_SOURCES = \ - dviextractor.c -libextractor_dvi_la_LDFLAGS = \ - $(PLUGINFLAGS) - if HAVE_ZLIB libextractor_tar_la_SOURCES = \ tarextractor.c diff --git a/src/plugins/dvi_extractor.c b/src/plugins/dvi_extractor.c @@ -0,0 +1,246 @@ +/* + This file is part of libextractor. + (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff + + libextractor is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published + by the Free Software Foundation; either version 2, or (at your + option) any later version. + + libextractor is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + General Public License for more details. + + You should have received a copy of the GNU General Public License + along with libextractor; see the file COPYING. If not, write to the + Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. + */ + +#include "platform.h" +#include "extractor.h" + +typedef struct +{ + char *text; + enum EXTRACTOR_MetaType type; +} Matches; + +static Matches tmap[] = { + {"/Title (", EXTRACTOR_METATYPE_TITLE}, + {"/Subject (", EXTRACTOR_METATYPE_SUBJECT}, + {"/Author (", EXTRACTOR_METATYPE_AUTHOR_NAME}, + {"/Keywords (", EXTRACTOR_METATYPE_KEYWORDS}, + {"/Creator (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE }, + {"/Producer (", EXTRACTOR_METATYPE_CREATED_BY_SOFTWARE}, + {NULL, 0}, +}; + +static int +parseZZZ (const char *data, + size_t pos, size_t len, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls) +{ + size_t slen; + size_t end; + int i; + char *value; + + end = pos + len; + slen = strlen ("ps:SDict begin ["); + if (len <= slen) + return 0; + if (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) + return 0; + pos += slen; + while (pos < end) + { + i = 0; + while (tmap[i].text != NULL) + { + slen = strlen (tmap[i].text); + if (pos + slen < end) + { + if (0 == strncmp (&data[pos], tmap[i].text, slen)) + { + pos += slen; + slen = pos; + while ((slen < end) && (data[slen] != ')')) + slen++; + slen = slen - pos; + value = malloc (slen + 1); + value[slen] = '\0'; + memcpy (value, &data[pos], slen); + if (0 != proc (proc_cls, + "dvi", + tmap[i].type, + EXTRACTOR_METAFORMAT_C_STRING, + "text/plain", + value, + strlen (value) +1)) + return 1; + pos += slen + 1; + } + } + i++; + } + pos++; + } + return 0; +} + +static unsigned int +getIntAt (const void *data) +{ + char p[4]; + + memcpy (p, data, 4); /* ensure alignment! */ + return *(unsigned int *) &p[0]; +} + +static unsigned int +getShortAt (const void *data) +{ + char p[2]; + + memcpy (p, data, 2); /* ensure alignment! 
*/ + return *(unsigned short *) &p[0]; +} + + +int +EXTRACTOR_dvi_extract (const unsigned char *data, + size_t size, + EXTRACTOR_MetaDataProcessor proc, + void *proc_cls, + const char *options) +{ + unsigned int klen; + char *comment; + unsigned int pos; + unsigned int opos; + unsigned int len; + unsigned int pageCount; + char pages[16]; + + if (size < 40) + return 0; + if ((data[0] != 247) || (data[1] != 2)) + return 0; /* cannot be dvi or unsupported version */ + klen = data[14]; + + pos = size - 1; + while ((data[pos] == 223) && (pos > 0)) + pos--; + if ((data[pos] != 2) || (pos < 40)) + return 0; + pos--; + pos -= 4; + /* assert pos at 'post_post tag' */ + if (data[pos] != 249) + return 0; + opos = pos; + pos = ntohl (getIntAt (&data[opos + 1])); + if (pos + 25 > size) + return 0; + /* assert pos at 'post' command */ + if (data[pos] != 248) + return 0; + pageCount = 0; + opos = pos; + pos = ntohl (getIntAt (&data[opos + 1])); + while (1) + { + if (pos == (unsigned int) -1) + break; + if (pos + 45 > size) + return 0; + if (data[pos] != 139) /* expect 'bop' */ + return 0; + pageCount++; + opos = pos; + pos = ntohl (getIntAt (&data[opos + 41])); + if (pos == (unsigned int) -1) + break; + if (pos >= opos) + return 0; /* invalid! */ + } + /* ok, now we believe it's a dvi... 
*/ + snprintf (pages, sizeof(pages), "%u", pageCount); + if (0 != proc (proc_cls, + "dvi", + EXTRACTOR_METATYPE_MIMETYPE, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + "application/x-dvi", + strlen ("application/x-dvi") +1)) + return 1; + comment = malloc (klen + 1); + comment[klen] = '\0'; + memcpy (comment, &data[15], klen); + if (0 != proc (proc_cls, + "dvi", + EXTRACTOR_METATYPE_COMMENT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + comment, + strlen (comment) +1)) + { + free (comment); + return 1; + } + free (comment); + if (0 != proc (proc_cls, + "dvi", + EXTRACTOR_METATYPE_PAGE_COUNT, + EXTRACTOR_METAFORMAT_UTF8, + "text/plain", + pages, + strlen (pages) +1)) + return 1; + /* try to find PDF/ps special */ + pos = opos; + while (pos < size - 100) + { + switch (data[pos]) + { + case 139: /* begin page 'bop', we typically have to skip that one to + find the zzz's */ + pos += 45; /* skip bop */ + break; + case 239: /* zzz1 */ + len = data[pos + 1]; + if (pos + 2 + len < size) + if (0 != parseZZZ ((const char *) data, pos + 2, len, proc, proc_cls)) + return 1; + pos += len + 2; + break; + case 240: /* zzz2 */ + len = ntohs (getShortAt (&data[pos + 1])); + if (pos + 3 + len < size) + if (0 != parseZZZ ((const char *) data, pos + 3, len, proc, proc_cls)) + return 1; + pos += len + 3; + break; + case 241: /* zzz3, who uses that? */ + len = (ntohs (getShortAt (&data[pos + 1]))) + 65536 * data[pos + 3]; + if (pos + 4 + len < size) + if (0 != parseZZZ ((const char *) data, pos + 4, len, proc, proc_cls)) + return 1; + pos += len + 4; + break; + case 242: /* zzz4, hurray! */ + len = ntohl (getIntAt (&data[pos + 1])); + if (pos + 1 + len < size) + if (0 != parseZZZ ((const char *) data, pos + 5, len, proc, proc_cls)) + return 1; + pos += len + 5; + break; + default: /* unsupported opcode, abort scan */ + return 0; + } + } + return 0; +} diff --git a/src/plugins/dviextractor.c b/src/plugins/dviextractor.c @@ -1,221 +0,0 @@ -/* - This file is part of libextractor. 
- (C) 2002, 2003, 2004 Vidyut Samanta and Christian Grothoff - - libextractor is free software; you can redistribute it and/or modify - it under the terms of the GNU General Public License as published - by the Free Software Foundation; either version 2, or (at your - option) any later version. - - libextractor is distributed in the hope that it will be useful, but - WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - General Public License for more details. - - You should have received a copy of the GNU General Public License - along with libextractor; see the file COPYING. If not, write to the - Free Software Foundation, Inc., 59 Temple Place - Suite 330, - Boston, MA 02111-1307, USA. - */ - -#include "platform.h" -#include "extractor.h" - -static EXTRACTOR_KeywordList * -addKeyword (EXTRACTOR_KeywordType type, - char *keyword, EXTRACTOR_KeywordList * next) -{ - EXTRACTOR_KeywordList *result; - - if (keyword == NULL) - return next; - result = malloc (sizeof (EXTRACTOR_KeywordList)); - result->next = next; - result->keyword = keyword; - result->keywordType = type; - return result; -} - -typedef struct -{ - char *text; - EXTRACTOR_KeywordType type; -} Matches; - -static Matches tmap[] = { - {"/Title (", EXTRACTOR_TITLE}, - {"/Subject (", EXTRACTOR_SUBJECT}, - {"/Author (", EXTRACTOR_AUTHOR}, - {"/Keywords (", EXTRACTOR_KEYWORDS}, - {"/Creator (", EXTRACTOR_CREATOR}, - {"/Producer (", EXTRACTOR_PRODUCER}, - {NULL, 0}, -}; - -static struct EXTRACTOR_Keywords * -parseZZZ (const char *data, - size_t pos, size_t len, struct EXTRACTOR_Keywords *prev) -{ - size_t slen; - size_t end; - int i; - char *value; - - end = pos + len; - slen = strlen ("ps:SDict begin ["); - if (len <= slen) - return prev; - if (0 != strncmp ("ps:SDict begin [ ", &data[pos], slen)) - return prev; - pos += slen; - while (pos < end) - { - i = 0; - while (tmap[i].text != NULL) - { - slen = strlen (tmap[i].text); - if (pos + 
slen < end) - { - if (0 == strncmp (&data[pos], tmap[i].text, slen)) - { - pos += slen; - slen = pos; - while ((slen < end) && (data[slen] != ')')) - slen++; - slen = slen - pos; - value = malloc (slen + 1); - value[slen] = '\0'; - memcpy (value, &data[pos], slen); - prev = addKeyword (tmap[i].type, value, prev); - pos += slen + 1; - } - } - i++; - } - pos++; - } - return prev; -} - -static unsigned int -getIntAt (const void *data) -{ - char p[4]; - - memcpy (p, data, 4); /* ensure alignment! */ - return *(unsigned int *) &p[0]; -} - -static unsigned int -getShortAt (const void *data) -{ - char p[2]; - - memcpy (p, data, 2); /* ensure alignment! */ - return *(unsigned short *) &p[0]; -} - -struct EXTRACTOR_Keywords * -libextractor_dvi_extract (const char *filename, - const unsigned char *data, - size_t size, struct EXTRACTOR_Keywords *prev) -{ - unsigned int klen; - char *comment; - unsigned int pos; - unsigned int opos; - unsigned int len; - unsigned int pageCount; - char *pages; - - if (size < 40) - return prev; - if ((data[0] != 247) || (data[1] != 2)) - return prev; /* cannot be dvi or unsupported version */ - klen = data[14]; - - pos = size - 1; - while ((data[pos] == 223) && (pos > 0)) - pos--; - if ((data[pos] != 2) || (pos < 40)) - return prev; - pos--; - pos -= 4; - /* assert pos at 'post_post tag' */ - if (data[pos] != 249) - return prev; - opos = pos; - pos = ntohl (getIntAt (&data[opos + 1])); - if (pos + 25 > size) - return prev; - /* assert pos at 'post' command */ - if (data[pos] != 248) - return prev; - pageCount = 0; - opos = pos; - pos = ntohl (getIntAt (&data[opos + 1])); - while (1) - { - if (pos == (unsigned int) -1) - break; - if (pos + 45 > size) - return prev; - if (data[pos] != 139) /* expect 'bop' */ - return prev; - pageCount++; - opos = pos; - pos = ntohl (getIntAt (&data[opos + 41])); - if (pos == (unsigned int) -1) - break; - if (pos >= opos) - return prev; /* invalid! */ - } - /* ok, now we believe it's a dvi... 
*/ - pages = malloc (16); - snprintf (pages, 16, "%u", pageCount); - comment = malloc (klen + 1); - comment[klen] = '\0'; - memcpy (comment, &data[15], klen); - prev = addKeyword (EXTRACTOR_MIMETYPE, strdup ("application/x-dvi"), prev); - prev = addKeyword (EXTRACTOR_COMMENT, comment, prev); - prev = addKeyword (EXTRACTOR_PAGE_COUNT, pages, prev); - /* try to find PDF/ps special */ - pos = opos; - while (pos < size - 100) - { - switch (data[pos]) - { - case 139: /* begin page 'bop', we typically have to skip that one to - find the zzz's */ - pos += 45; /* skip bop */ - break; - case 239: /* zzz1 */ - len = data[pos + 1]; - if (pos + 2 + len < size) - prev = parseZZZ ((const char *) data, pos + 2, len, prev); - pos += len + 2; - break; - case 240: /* zzz2 */ - len = ntohs (getShortAt (&data[pos + 1])); - if (pos + 3 + len < size) - prev = parseZZZ ((const char *) data, pos + 3, len, prev); - pos += len + 3; - break; - case 241: /* zzz3, who uses that? */ - len = (ntohs (getShortAt (&data[pos + 1]))) + 65536 * data[pos + 3]; - if (pos + 4 + len < size) - prev = parseZZZ ((const char *) data, pos + 4, len, prev); - pos += len + 4; - break; - case 242: /* zzz4, hurray! */ - len = ntohl (getIntAt (&data[pos + 1])); - if (pos + 1 + len < size) - prev = parseZZZ ((const char *) data, pos + 5, len, prev); - pos += len + 5; - break; - default: /* unsupported opcode, abort scan */ - return prev; - } - } - return prev; -}