/*                                                                     HTML.c
**      SIMPLE HTML PARSER WITHOUT ANY PRESENTATION CODE
**
**      (c) COPYRIGHT MIT 1995.
**      Please first read the full copyright statement in the file COPYRIGH.
**      @(#) $Id: HTML.c,v 1.75 1999/01/06 15:38:47 frystyk Exp $
**
**      This generates a hypertext object. It converts from the
**      structured stream interface of HTML events into the style-
**      oriented interface of the HText interface.
**
** HISTORY:
**       8 Jul 94  FM   Insulate free() from _free structure element.
*/

/* Library include files */
#include "wwwsys.h"
#include "WWWUtil.h"
#include "WWWCore.h"
#include "WWWHTML.h"
#include "HTML.h"
#include "HTextImp.h"

/*
**      Shorthands for calling the methods of the output stream stored in
**      the `target' field of the object passed as `t'.
**
**      ABORT_TARGET additionally expands to the identifier `e', which is
**      NOT a macro parameter: it must be a variable in scope at the point
**      of use.
*/
#define PUTC(t,c)       (*(t)->target->isa->put_character)((t)->target, (c))
#define PUTS(t,s)       (*(t)->target->isa->put_string)((t)->target, (s))
#define PUTB(t,b,l)     (*(t)->target->isa->put_block)((t)->target, (b), (l))
#define FLUSH_TARGET(t) (*(t)->target->isa->flush)((t)->target)
#define FREE_TARGET(t)  (*(t)->target->isa->_free)((t)->target)
#define ABORT_TARGET(t) (*(t)->target->isa->abort)((t)->target, e)

/* Number of slots in the element stack of an HTStructured object */
#define MAX_NESTING     40

/* Generic stream: only the class pointer is needed in this file */
struct _HTStream {
    const HTStreamClass *       isa;
    /* .... */
};

/* State of one HTML structured stream */
struct _HTStructured {
    const HTStructuredClass *   isa;
    HTRequest *                 request;
    HTParentAnchor *            node_anchor;
    HTextImp *                  text;
    HTStream *                  target;         /* Output stream, may be NULL */
    HTChunk *                   title;          /* Collects TITLE content */
    BOOL                        in_word;
    SGML_dtd *                  dtd;
    char *                      comment_start;  /* for literate programming */
    char *                      comment_end;
    BOOL                        started;        /* HTEXT_BEGIN already sent? */

    int                         overflow;       /* Start tags seen while the
                                                   stack was already full */
    int *                       sp;             /* Current top of stack */
    int                         stack[MAX_NESTING];
};

/*
**      Entity values -- for ISO Latin 1 local representation
**      This MUST match exactly the table referred to in the DTD!
*/ #define ENTITY_SIZE 67 static char * ISO_Latin1[ENTITY_SIZE] = { "\306", /* capital AE diphthong (ligature) */ "\301", /* capital A, acute accent */ "\302", /* capital A, circumflex accent */ "\300", /* capital A, grave accent */ "\305", /* capital A, ring */ "\303", /* capital A, tilde */ "\304", /* capital A, dieresis or umlaut mark */ "\307", /* capital C, cedilla */ "\320", /* capital Eth, Icelandic */ "\311", /* capital E, acute accent */ "\312", /* capital E, circumflex accent */ "\310", /* capital E, grave accent */ "\313", /* capital E, dieresis or umlaut mark */ "\315", /* capital I, acute accent */ "\316", /* capital I, circumflex accent */ "\314", /* capital I, grave accent */ "\317", /* capital I, dieresis or umlaut mark */ "\321", /* capital N, tilde */ "\323", /* capital O, acute accent */ "\324", /* capital O, circumflex accent */ "\322", /* capital O, grave accent */ "\330", /* capital O, slash */ "\325", /* capital O, tilde */ "\326", /* capital O, dieresis or umlaut mark */ "\336", /* capital THORN, Icelandic */ "\332", /* capital U, acute accent */ "\333", /* capital U, circumflex accent */ "\331", /* capital U, grave accent */ "\334", /* capital U, dieresis or umlaut mark */ "\335", /* capital Y, acute accent */ "\341", /* small a, acute accent */ "\342", /* small a, circumflex accent */ "\346", /* small ae diphthong (ligature) */ "\340", /* small a, grave accent */ "\046", /* ampersand */ "\345", /* small a, ring */ "\343", /* small a, tilde */ "\344", /* small a, dieresis or umlaut mark */ "\347", /* small c, cedilla */ "\351", /* small e, acute accent */ "\352", /* small e, circumflex accent */ "\350", /* small e, grave accent */ "\360", /* small eth, Icelandic */ "\353", /* small e, dieresis or umlaut mark */ "\076", /* greater than */ "\355", /* small i, acute accent */ "\356", /* small i, circumflex accent */ "\354", /* small i, grave accent */ "\357", /* small i, dieresis or umlaut mark */ "\074", /* less than */ "\040", /* 
non-breaking space */ "\361", /* small n, tilde */ "\363", /* small o, acute accent */ "\364", /* small o, circumflex accent */ "\362", /* small o, grave accent */ "\370", /* small o, slash */ "\365", /* small o, tilde */ "\366", /* small o, dieresis or umlaut mark */ "\042", /* double quote sign - June 94 */ "\337", /* small sharp s, German (sz ligature) */ "\376", /* small thorn, Icelandic */ "\372", /* small u, acute accent */ "\373", /* small u, circumflex accent */ "\371", /* small u, grave accent */ "\374", /* small u, dieresis or umlaut mark */ "\375", /* small y, acute accent */ "\377", /* small y, dieresis or umlaut mark */ }; PRIVATE char ** CurrentEntityValues = ISO_Latin1; PUBLIC BOOL HTMLUseCharacterSet (HTMLCharacterSet i) { if (i == HTML_ISO_LATIN1) { CurrentEntityValues = ISO_Latin1; return YES; } else { if (SGML_TRACE) HTTrace("HTML Parser. Doesn't support this character set\n"); return NO; } } PRIVATE int HTML_write (HTStructured * me, const char * b, int l) { if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } /* Look at what we got */ switch (me->sp[0]) { case HTML_COMMENT: break; /* Do Nothing */ case HTML_TITLE: HTChunk_putb(me->title, b, l); /* Fall through */ default: HTextImp_addText(me->text, b, l); } return HT_OK; } PRIVATE int HTML_put_character (HTStructured * me, char c) { return HTML_write(me, &c, sizeof(char)); } PRIVATE int HTML_put_string (HTStructured * me, const char* s) { return HTML_write(me, s, (int) strlen(s)); } PRIVATE void HTML_start_element (HTStructured * me, int element_number, const BOOL * present, const char ** value) { HTChildAnchor * address = NULL; if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } /* Look at what element was started */ switch (element_number) { case HTML_A: if (present[HTML_A_HREF] && value[HTML_A_HREF]) { address = HTAnchor_findChildAndLink( me->node_anchor, /* parent */ present[HTML_A_NAME] ? 
value[HTML_A_NAME] : NULL, /* Tag */ value[HTML_A_HREF], /* Addresss */ present[HTML_A_REL] && value[HTML_A_REL] ? (HTLinkType) HTAtom_caseFor(value[HTML_A_REL]) : NULL); if (present[HTML_A_TITLE] && value[HTML_A_TITLE]) { HTLink * link = HTAnchor_mainLink((HTAnchor *) address); HTParentAnchor * dest = HTAnchor_parent(HTLink_destination(link)); if (!HTAnchor_title(dest)) HTAnchor_setTitle(dest, value[HTML_A_TITLE]); } HTextImp_foundLink(me->text, element_number, HTML_A_HREF, address, present, value); if (SGML_TRACE) HTTrace("HTML Parser. Anchor `%s\'\n", value[HTML_A_HREF]); } break; case HTML_AREA: if (present[HTML_AREA_HREF] && value[HTML_AREA_HREF]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_AREA_HREF], NULL); HTextImp_foundLink(me->text, element_number, HTML_AREA_HREF, address, present, value); if (SGML_TRACE) HTTrace("HTML Parser. Image map area `%s\'\n", value[HTML_AREA_HREF]); } break; case HTML_BASE: if (present[HTML_BASE_HREF] && value[HTML_BASE_HREF]) { HTAnchor_setBase(me->node_anchor, (char *) value[HTML_BASE_HREF]); if (SGML_TRACE) HTTrace("HTML Parser. New base `%s\'\n", value[HTML_BASE_HREF]); } break; case HTML_BODY: if (present[HTML_BODY_BACKGROUND] && value[HTML_BODY_BACKGROUND]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_BODY_BACKGROUND], NULL); HTextImp_foundLink(me->text, element_number, HTML_BODY_BACKGROUND, address, present, value); if (SGML_TRACE) HTTrace("HTML Parser. Background `%s\'\n", value[HTML_BODY_BACKGROUND]); } break; case HTML_FRAME: if (present[HTML_FRAME_SRC] && value[HTML_FRAME_SRC]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_FRAME_SRC], NULL); HTextImp_foundLink(me->text, element_number, HTML_FRAME_SRC, address, present, value); if (SGML_TRACE) HTTrace("HTML Parser. 
Frame `%s\'\n", value[HTML_FRAME_SRC]); } break; case HTML_IMG: if (present[HTML_IMG_SRC] && value[HTML_IMG_SRC]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_IMG_SRC], NULL); HTextImp_foundLink(me->text, element_number, HTML_IMG_SRC, address, present, value); } break; case HTML_ISINDEX: HTAnchor_setIndex(me->node_anchor); break; case HTML_LINK: if (present[HTML_LINK_HREF] && value[HTML_LINK_HREF]) { HTParentAnchor * dest = NULL; address = HTAnchor_findChildAndLink( me->node_anchor, /* parent */ present[HTML_A_NAME] ? value[HTML_A_NAME] : NULL, /* Tag */ present[HTML_A_HREF] ? value[HTML_A_HREF] : NULL, /* Addresss */ NULL); /* Rels */ dest = HTAnchor_parent(HTAnchor_followMainLink((HTAnchor *) address)); /* If forward reference */ if ((present[HTML_LINK_REL] && value[HTML_LINK_REL])) { char * strval = NULL; char * ptr = NULL; char * relation = NULL; StrAllocCopy(strval, value[HTML_LINK_REL]); ptr = strval; while ((relation = HTNextLWSToken(&ptr)) != NULL) { HTLink_add((HTAnchor *) me->node_anchor, (HTAnchor *) dest, (HTLinkType) HTAtom_caseFor(relation), METHOD_INVALID); } HT_FREE(strval); } /* If reverse reference */ if ((present[HTML_LINK_REV] && value[HTML_LINK_REV])) { char * strval = NULL; char * ptr = NULL; char * relation = NULL; StrAllocCopy(strval, value[HTML_LINK_REV]); ptr = strval; while ((relation = HTNextLWSToken(&ptr)) != NULL) { HTLink_add((HTAnchor *) dest, (HTAnchor *) me->node_anchor, (HTLinkType) HTAtom_caseFor(relation), METHOD_INVALID); } HT_FREE(strval); } /* If we got any type information as well */ if (present[HTML_LINK_TYPE] && value[HTML_LINK_TYPE]) { if (HTAnchor_format(dest) == WWW_UNKNOWN) HTAnchor_setFormat(dest, (HTFormat) HTAtom_caseFor(value[HTML_LINK_TYPE])); } /* Call out to the layout engine */ HTextImp_foundLink(me->text, element_number, HTML_LINK_HREF, address, present, value); } break; case HTML_META: if (present[HTML_META_NAME] && value[HTML_META_NAME]) { HTAnchor_addMeta (me->node_anchor, 
value[HTML_META_NAME], (present[HTML_META_CONTENT] && value[HTML_META_CONTENT]) ? value[HTML_META_CONTENT] : ""); } break; case HTML_OBJECT: if (present[HTML_OBJECT_CLASSID] && value[HTML_OBJECT_CLASSID]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_OBJECT_CLASSID], NULL); HTextImp_foundLink(me->text, element_number, HTML_OBJECT_CLASSID, address, present, value); } if (present[HTML_OBJECT_CODEBASE] && value[HTML_OBJECT_CODEBASE]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_OBJECT_CODEBASE], NULL); HTextImp_foundLink(me->text, element_number, HTML_OBJECT_CODEBASE, address, present, value); } if (present[HTML_OBJECT_DATA] && value[HTML_OBJECT_DATA]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_OBJECT_DATA], NULL); HTextImp_foundLink(me->text, element_number, HTML_OBJECT_DATA, address, present, value); } if (present[HTML_OBJECT_ARCHIVE] && value[HTML_OBJECT_ARCHIVE]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_OBJECT_ARCHIVE], NULL); HTextImp_foundLink(me->text, element_number, HTML_OBJECT_ARCHIVE, address, present, value); } if (present[HTML_OBJECT_USEMAP] && value[HTML_OBJECT_USEMAP]) { address = HTAnchor_findChildAndLink(me->node_anchor, NULL, value[HTML_OBJECT_USEMAP], NULL); HTextImp_foundLink(me->text, element_number, HTML_OBJECT_USEMAP, address, present, value); } break; case HTML_PRE: if (me->comment_end) HTextImp_addText(me->text, me->comment_end, strlen(me->comment_end)); break; case HTML_TITLE: HTChunk_clear(me->title); break; } /* Update our parse stack */ if (SGML_findTagContents(me->dtd, element_number) != SGML_EMPTY) { if (me->sp == me->stack) { if (SGML_TRACE) HTTrace("HTML Parser. 
Maximum nesting of %d exceded!\n", MAX_NESTING); me->overflow++; return; } --(me->sp); me->sp[0] = element_number; } /* Call out to the layout engine */ HTextImp_beginElement(me->text, element_number, present, value); } PRIVATE void HTML_end_element (HTStructured * me, int element_number) { if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } /* Update our parse stack */ if (me->overflow > 0) { me->overflow--; return; } me->sp++; if (me->sp > me->stack + MAX_NESTING - 1) { if (SGML_TRACE) HTTrace("HTML Parser. Bottom of parse stack reached\n"); me->sp = me->stack + MAX_NESTING - 1; } /* Look at what element was closed */ switch(element_number) { case HTML_TITLE: HTAnchor_setTitle(me->node_anchor, HTChunk_data(me->title)); break; case HTML_PRE: if (me->comment_start) HTextImp_addText(me->text, me->comment_start, strlen(me->comment_start)); break; } /* Call out to the layout engine */ HTextImp_endElement(me->text, element_number); } PRIVATE void HTML_put_entity (HTStructured * me, int entity_number) { if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } if (entity_number>=0 && entity_numberstarted) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } if (me->comment_end) HTML_put_string(me, me->comment_end); return me->target ? 
FLUSH_TARGET(me) : HT_OK; } PRIVATE int HTML_unparsedBeginElement (HTStructured * me, const char * b, int l) { if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } HTextImp_unparsedBeginElement(me->text, b, l); return HT_OK; } PRIVATE int HTML_unparsedEndElement (HTStructured * me, const char * b, int l) { if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } HTextImp_unparsedEndElement(me->text, b, l); return HT_OK; } PRIVATE int HTML_unparsedEntity (HTStructured * me, const char * b, int l) { if (!me->started) { HTextImp_build(me->text, HTEXT_BEGIN); me->started = YES; } HTextImp_unparsedEntity(me->text, b, l); return HT_OK; } PUBLIC int HTML_free (HTStructured * me) { if (!me->started) HTextImp_build(me->text, HTEXT_BEGIN); if (me->comment_end) HTML_put_string(me, me->comment_end); HTextImp_build(me->text, HTEXT_END); HTChunk_delete(me->title); if (me->target) FREE_TARGET(me); HT_FREE(me); return HT_OK; } PRIVATE int HTML_abort (HTStructured * me, HTList * e) { if (!me->started) HTextImp_build(me->text, HTEXT_BEGIN); HTextImp_build(me->text, HTEXT_ABORT); HTChunk_delete(me->title); if (me->target) ABORT_TARGET(me); HT_FREE(me); return HT_ERROR; } /* Structured Object Class ** ----------------------- */ PRIVATE const HTStructuredClass HTMLPresentation = /* As opposed to print etc */ { "text/html", HTML_flush, HTML_free, HTML_abort, HTML_put_character, HTML_put_string, HTML_write, HTML_start_element, HTML_end_element, HTML_put_entity, HTML_unparsedBeginElement, HTML_unparsedEndElement, HTML_unparsedEntity }; /* Structured Text object ** ---------------------- ** ** The structured stream can generate either presentation, ** or plain text, or HTML. 
*/ PRIVATE HTStructured * HTML_new (HTRequest * request, void * param, HTFormat input_format, HTFormat output_format, HTStream * output_stream) { HTStructured * me = NULL; if (request) { if ((me = (HTStructured *) HT_CALLOC(1, sizeof(HTStructured))) == NULL) HT_OUTOFMEM("HTML_new"); me->isa = &HTMLPresentation; me->dtd = HTML_dtd(); me->request = request; me->node_anchor = HTRequest_anchor(request); me->title = HTChunk_new(128); me->comment_start = NULL; me->comment_end = NULL; me->target = output_stream; me->sp = me->stack + MAX_NESTING - 1; /* Create the text object */ me->text = HTextImp_new(me->request, me->node_anchor, me->target); } return me; } /* HTConverter for HTML to plain text ** ---------------------------------- ** ** This will convert from HTML to presentation or plain text. */ PUBLIC HTStream * HTMLToPlain (HTRequest * request, void * param, HTFormat input_format, HTFormat output_format, HTStream * output_stream) { return SGML_new(HTML_dtd(), HTML_new( request, NULL, input_format, output_format, output_stream)); } /* HTConverter for HTML to C code ** ------------------------------ ** ** C code is like plain text but all non-preformatted code ** is commented out. ** This will convert from HTML to presentation or plain text. */ PUBLIC HTStream * HTMLToC (HTRequest * request, void * param, HTFormat input_format, HTFormat output_format, HTStream * output_stream) { if (output_stream) { HTStructured * html = NULL; (*output_stream->isa->put_string)(output_stream, "/* "); /* Before title */ html = HTML_new(request, NULL, input_format, output_format, output_stream); html->comment_start = "\n/* "; html->dtd = HTML_dtd(); html->comment_end = " */\n"; /* Must start in col 1 for cpp */ return SGML_new(HTML_dtd(), html); } else return HTErrorStream(); } /* Presenter for HTML ** ------------------ ** ** This will convert from HTML to presentation or plain text. 
**
**      Override this if you have a windows version
*/
PUBLIC HTStream * HTMLPresent (HTRequest *      request,
                               void *           param,
                               HTFormat         input_format,
                               HTFormat         output_format,
                               HTStream *       output_stream)
{
    /* `param' is not forwarded: the HTML object is always created with NULL */
    HTStructured * html = HTML_new(request, NULL, input_format,
                                   output_format, output_stream);

    /* Wrap the HTML object in an SGML parser stream driven by the HTML DTD */
    return SGML_new(HTML_dtd(), html);
}