/* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#ifndef _UDM_PARSE_HTML_H
#define _UDM_PARSE_HTML_H
/*
  HTML tag types.
  Enumerators are numbered consecutively from 0; the numeric values are
  spelled out so that they stay stable if entries are ever rearranged.
*/
enum udm_html_tag
{
  UDM_HTML_TAG_UNKNOWN = 0,   /* first entry, value 0 */
  UDM_HTML_TAG_A       = 1,
  UDM_HTML_TAG_TITLE   = 2,
  UDM_HTML_TAG_HTML    = 3,
  UDM_HTML_TAG_BODY    = 4,
  UDM_HTML_TAG_META    = 5,
  UDM_HTML_TAG_BASE    = 6,
  UDM_HTML_TAG_LINK    = 7,
  UDM_HTML_TAG_AREA    = 8,
  UDM_HTML_TAG_IMG     = 9,
  UDM_HTML_TAG_FRAME   = 10,
  UDM_HTML_TAG_IFRAME  = 11,
  UDM_HTML_TAG_SCRIPT  = 12,
  UDM_HTML_TAG_STYLE   = 13,
  UDM_HTML_TAG_NOINDEX = 14,
  UDM_HTML_TAG_FONT    = 15,
  UDM_HTML_TAG_P       = 16
};
/*
  UDM_MAXTAGVAL sizes the token array of UDM_HTML_TAG below
  (the array holds UDM_MAXTAGVAL + 1 elements).
*/
#define UDM_MAXTAGVAL 64
/*
  A parsed HTML tag: a fixed-capacity array of UDM_CONST_ATTR tokens.
  NOTE(review): ntoks is presumably the number of toks[] entries in use,
  and the extra "+ 1" slot presumably separates the tag name from its
  attributes or acts as a terminator - confirm against UdmHTMLTagParse().
*/
typedef struct udm_html_tag_st
{
size_t ntoks; /* token counter for toks[] */
UDM_CONST_ATTR toks[UDM_MAXTAGVAL + 1]; /* token storage, fixed capacity */
} UDM_HTML_TAG;
/*
  Tag-level API. All functions operate on a caller-provided UDM_HTML_TAG.
  NOTE(review): the implementations are not visible from this header;
  the notes below are inferred from the signatures and should be
  confirmed against the source file.
*/
/* Presumably resets "tag" to an empty state before parsing. */
void UdmHTMLTagInit(UDM_HTML_TAG *tag);
/*
  Parse the tag text src[0..length) into "tag".
  NOTE(review): the meaning of the size_t result (bytes consumed?
  number of tokens?) is not visible here - confirm.
*/
size_t UdmHTMLTagParse(UDM_HTML_TAG *tag, const char *src, size_t length);
/*
  Look up an attribute of "tag" by name; the name is passed together with
  an explicit length (name, namelen).
  NOTE(review): presumably returns NULL when no such attribute exists - confirm.
*/
UDM_CONST_STR *UdmHTMLTagFindAttrByName(UDM_HTML_TAG *tag, const char *name, size_t namelen);
/*
  Find an attribute that represents a META tag name.
*/
UDM_CONST_STR *UdmHTMLTagFindAttrMetaName(UDM_HTML_TAG *tag);
/************************************************/
/*
  HTML token kinds.
  The four non-zero kinds line up by name with the tag/text/comment/pi
  handler callbacks of UDM_HTML_PARSER.
*/
enum udm_html_token
{
  UDM_HTML_TOKEN_UNKNOWN = 0,
  UDM_HTML_TOKEN_TAG     = 1,
  UDM_HTML_TOKEN_TEXT    = 2,
  UDM_HTML_TOKEN_COMMENT = 3,
  UDM_HTML_TOKEN_PI      = 4   /* presumably "processing instruction" */
};
/*
  Position flags of the HTML parser: each member tells whether the parser
  is currently inside the corresponding construct.
  NOTE(review): the element names in the member comments are inferred
  from the member names - confirm against the parser implementation.
*/
typedef struct udm_html_parser_state_st
{
char script; /* if inside <script>..</script> */
char style; /* if inside <style>..</style> */
char title; /* if inside <title>..</title> */
char body; /* if inside <body>..</body> */
char ahref; /* if inside <a href=..>..</a> */
char comment; /* if inside <!-- .. --> */
} UDM_HTML_PARSER_STATE;
/*
  Callback-driven HTML parser context.
  Holds the position flags, token/tag scratch data, an opaque user pointer
  and one callback per token kind. Every callback receives a pointer to
  this same structure and returns udm_rc_t.
  NOTE(review): when each callback fires and how a non-success return
  code is treated is decided by UdmHTMLParserExec() - not visible here.
*/
typedef struct udm_html_parser_st
{
UDM_HTML_PARSER_STATE state; /* "inside which element" flags */
UDM_CONST_STR lasttok; /* presumably the most recently scanned token - TODO confirm */
UDM_HTML_TAG tag; /* tag token storage (see UDM_HTML_TAG) */
void *user_data; /* opaque caller pointer, see UdmHTMLParserSetUserData() */
udm_rc_t (*tag_handler)(struct udm_html_parser_st *st); /* callback for tags */
udm_rc_t (*comment_handler)(struct udm_html_parser_st *st); /* callback for comments */
udm_rc_t (*text_handler)(struct udm_html_parser_st *st); /* callback for text */
udm_rc_t (*pi_handler)(struct udm_html_parser_st *st); /* callback for PI tokens */
} UDM_HTML_PARSER;
/*
  Parser-level API operating on a caller-provided UDM_HTML_PARSER.
  NOTE(review): implementations are not visible from this header; the
  notes below follow from the signatures only.
*/
/* Prepare "parser" for use; reports status as udm_rc_t. */
udm_rc_t UdmHTMLParserInit(UDM_HTML_PARSER *parser);
/* Attach an opaque caller pointer (UDM_HTML_PARSER::user_data). */
void UdmHTMLParserSetUserData(UDM_HTML_PARSER *parser, void *data);
/*
  Run the parser over the buffer src[0..length).
  The length parameter was previously spelled "strlen", which shadows the
  standard library function of the same name; it is now called "length",
  matching UdmHTMLTagParse(). Parameter names in a prototype are not part
  of the function's type, so callers and the definition are unaffected.
*/
udm_rc_t UdmHTMLParserExec(UDM_HTML_PARSER *parser, const char *src, size_t length);
/*
  Callback setters, one per token kind. Each "action" receives the parser
  itself and returns udm_rc_t.
*/
void UdmHTMLParserSetTextHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*));
void UdmHTMLParserSetTagHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*));
void UdmHTMLParserSetCommentHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*));
void UdmHTMLParserSetPIHandler(UDM_HTML_PARSER *p, udm_rc_t (*action)(UDM_HTML_PARSER*));
/**************************************************/
/*
  Document-level parser entry points. Each takes the agent and the
  document and reports status as udm_rc_t.
  Parameters are named Agent/Doc like the other prototypes in this header.
  NOTE(review): what exactly each parser extracts from the document is
  defined in the implementation - not visible here.
*/
udm_rc_t UdmHTMLParse(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmParseURLText(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmParseText(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmParseHeaders(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
/*
  RFC 822 message entry points: parse, excerpt source, cached copy.
  NOTE(review): the meaning of the size_t result of the ExcerptSource
  function and the contents written to dst/dstr are not visible here.
*/
udm_rc_t UdmMessageRFC822Parse(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
size_t UdmMessageRFC822ExcerptSource(UDM_AGENT *Agent,
                                     UDM_QUERY *Query,
                                     UDM_DOCUMENT *Doc,
                                     const UDM_CONST_STR *content,
                                     UDM_DSTR *dst);
udm_rc_t UdmMessageRFC822CachedCopy(UDM_AGENT *Agent,
                                    UDM_QUERY *Query,
                                    UDM_DOCUMENT *Doc,
                                    UDM_DSTR *dstr);
/*
  RTF entry points: parse, cached copy, excerpt source.
  NOTE(review): the meaning of the size_t result of the ExcerptSource
  function and the contents written to dst/dstr are not visible here.
*/
udm_rc_t UdmRTFParse(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmRTFCachedCopy(UDM_AGENT *Agent,
                          UDM_QUERY *Query,
                          UDM_DOCUMENT *Doc,
                          UDM_DSTR *dstr);
size_t UdmRTFExcerptSource(UDM_AGENT *Agent,
                           UDM_QUERY *Query,
                           UDM_DOCUMENT *Doc,
                           const UDM_CONST_STR *content,
                           UDM_DSTR *dst);
/*
  DOCX entry points: parse, cached copy, excerpt source.
  NOTE(review): the meaning of the size_t result of the ExcerptSource
  function and the contents written to dst/dstr are not visible here.
*/
udm_rc_t UdmDOCXParse(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmDOCXCachedCopy(UDM_AGENT *Agent,
                           UDM_QUERY *Query,
                           UDM_DOCUMENT *Doc,
                           UDM_DSTR *dstr);
size_t UdmDOCXExcerptSource(UDM_AGENT *Agent,
                            UDM_QUERY *Query,
                            UDM_DOCUMENT *Doc,
                            const UDM_CONST_STR *content,
                            UDM_DSTR *dst);
/*
  HTDB entry points: parse and excerpt source.
  NOTE(review): the meaning of the size_t result of the ExcerptSource
  function and the contents written to dst are not visible here.
*/
udm_rc_t UdmHTDBParse(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
size_t UdmHTDBExcerptSource(UDM_AGENT *Agent,
                            UDM_QUERY *Query,
                            UDM_DOCUMENT *Doc,
                            const UDM_CONST_STR *content,
                            UDM_DSTR *dst);
/*
  Conversion routine taking a UDM_CONV context, a destination buffer
  (dst, dstlen), a source buffer (src, srclen) and a flags word.
  NOTE(review): the name suggests decoding of RFC 1522 encoded words
  (MIME message header extensions); the meaning of "flags" and of the
  size_t result (bytes written?) is not visible here - confirm.
*/
size_t UdmConvRFC1522(UDM_CONV *conv, char *dst, size_t dstlen,
const char *src, size_t srclen, int flags);
/*
  Document preparation entry points and the internal parser dispatcher.
  Each takes the agent and the document and reports status as udm_rc_t.
  Parameters are named Agent/Doc like the other prototypes in this header.
  NOTE(review): UdmDocInternalParserExec() presumably selects a built-in
  parser according to content_type - confirm against the implementation.
*/
udm_rc_t UdmPrepareWords(UDM_AGENT *Agent, UDM_DOCUMENT *Doc, UDM_WORDLIST *WL);
udm_rc_t UdmPrepareSections(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmPrepareRawSections(UDM_AGENT *Agent, UDM_DOCUMENT *Doc);
udm_rc_t UdmDocInternalParserExec(UDM_AGENT *Agent, UDM_DOCUMENT *Doc,
                                  udm_content_type_t content_type);
#endif