/* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _UDM_PARSE_HTML_H #define _UDM_PARSE_HTML_H /* HTML tag types */ enum udm_html_tag { UDM_HTML_TAG_UNKNOWN= 0, UDM_HTML_TAG_A, UDM_HTML_TAG_TITLE, UDM_HTML_TAG_HTML, UDM_HTML_TAG_BODY, UDM_HTML_TAG_META, UDM_HTML_TAG_BASE, UDM_HTML_TAG_LINK, UDM_HTML_TAG_AREA, UDM_HTML_TAG_IMG, UDM_HTML_TAG_FRAME, UDM_HTML_TAG_IFRAME, UDM_HTML_TAG_SCRIPT, UDM_HTML_TAG_STYLE, UDM_HTML_TAG_NOINDEX, UDM_HTML_TAG_FONT, UDM_HTML_TAG_P }; #define UDM_MAXTAGVAL 64 typedef struct udm_html_tag_st { size_t ntoks; UDM_CONST_ATTR toks[UDM_MAXTAGVAL + 1]; } UDM_HTML_TAG; void UdmHTMLTagInit(UDM_HTML_TAG *tag); size_t UdmHTMLTagParse(UDM_HTML_TAG *tag, const char *src, size_t length); UDM_CONST_STR *UdmHTMLTagFindAttrByName(UDM_HTML_TAG *tag, const char *name, size_t namelen); /* Find an attribute that that represents a META tag name: */ UDM_CONST_STR *UdmHTMLTagFindAttrMetaName(UDM_HTML_TAG *tag); /************************************************/ /* HTML parser states */ enum udm_html_token { UDM_HTML_TOKEN_UNKNOWN= 0, UDM_HTML_TOKEN_TAG= 1, UDM_HTML_TOKEN_TEXT= 2, UDM_HTML_TOKEN_COMMENT= 3, UDM_HTML_TOKEN_PI= 4 }; typedef struct udm_html_parser_state_st { char script; /* if inside */ char style; /* if inside */ char title; /* if inside .. */ char body; /* if inside .. */ char ahref; /* if inside .. */ char comment; /* if inside */ } UDM_HTML_PARSER_STATE; typedef struct udm_html_parser_st { UDM_HTML_PARSER_STATE state; UDM_CONST_STR lasttok; UDM_HTML_TAG tag; void *user_data; udm_rc_t (*tag_handler)(struct udm_html_parser_st *st); udm_rc_t (*comment_handler)(struct udm_html_parser_st *st); udm_rc_t (*text_handler)(struct udm_html_parser_st *st); udm_rc_t (*pi_handler)(struct udm_html_parser_st *st); } UDM_HTML_PARSER; udm_rc_t UdmHTMLParserInit(UDM_HTML_PARSER *parser); void UdmHTMLParserSetUserData(UDM_HTML_PARSER *parser, void *data); udm_rc_t UdmHTMLParserExec(UDM_HTML_PARSER *parser, const char *src, size_t strlen); void UdmHTMLParserSetTextHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*)); void UdmHTMLParserSetTagHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*)); void UdmHTMLParserSetCommentHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*)); void UdmHTMLParserSetPIHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*)); /**************************************************/ udm_rc_t UdmHTMLParse(UDM_AGENT*, UDM_DOCUMENT*); udm_rc_t UdmParseURLText(UDM_AGENT*, UDM_DOCUMENT*); udm_rc_t UdmParseText(UDM_AGENT*, UDM_DOCUMENT*); udm_rc_t UdmParseHeaders(UDM_AGENT*, UDM_DOCUMENT*); udm_rc_t UdmMessageRFC822Parse(UDM_AGENT *, UDM_DOCUMENT *); size_t UdmMessageRFC822ExcerptSource(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, const UDM_CONST_STR *content, UDM_DSTR *dst); udm_rc_t UdmMessageRFC822CachedCopy(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, UDM_DSTR *dstr); udm_rc_t UdmRTFParse(UDM_AGENT *, UDM_DOCUMENT *); udm_rc_t UdmRTFCachedCopy(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, UDM_DSTR *dstr); size_t UdmRTFExcerptSource(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, const UDM_CONST_STR *content, UDM_DSTR *dst); udm_rc_t UdmDOCXParse(UDM_AGENT *, UDM_DOCUMENT *); udm_rc_t UdmDOCXCachedCopy(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, UDM_DSTR *dstr); size_t UdmDOCXExcerptSource(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, const UDM_CONST_STR *content, UDM_DSTR *dst); udm_rc_t UdmHTDBParse(UDM_AGENT *, UDM_DOCUMENT *); size_t UdmHTDBExcerptSource(UDM_AGENT *Agent, UDM_QUERY *Query, UDM_DOCUMENT *Doc, const UDM_CONST_STR *content, UDM_DSTR *dst); size_t UdmConvRFC1522(UDM_CONV *conv, char *dst, size_t dstlen, const char *src, size_t srclen, int flags); udm_rc_t UdmPrepareWords(UDM_AGENT*, UDM_DOCUMENT*, UDM_WORDLIST *WL); udm_rc_t UdmPrepareSections(UDM_AGENT *, UDM_DOCUMENT*); udm_rc_t UdmPrepareRawSections(UDM_AGENT *, UDM_DOCUMENT *); udm_rc_t UdmDocInternalParserExec(UDM_AGENT *, UDM_DOCUMENT *, udm_content_type_t); #endif