From b1509f22892dc98057c750e7fae39ded5cea3b09 Mon Sep 17 00:00:00 2001 From: Kirill Volinsky Date: Sat, 19 May 2012 18:01:32 +0000 Subject: added MirOTR git-svn-id: http://svn.miranda-ng.org/main/trunk@83 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c --- plugins/MirOTR/ekhtml/src/ekhtml.c | 394 +++++++++++++++++++++++++++++++++++++ 1 file changed, 394 insertions(+) create mode 100644 plugins/MirOTR/ekhtml/src/ekhtml.c (limited to 'plugins/MirOTR/ekhtml/src/ekhtml.c') diff --git a/plugins/MirOTR/ekhtml/src/ekhtml.c b/plugins/MirOTR/ekhtml/src/ekhtml.c new file mode 100644 index 0000000000..f3697469f6 --- /dev/null +++ b/plugins/MirOTR/ekhtml/src/ekhtml.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2002, Jon Travis + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * ekhtml: The El-Kabong HTML parser + * by Jon Travis (jtravis@p00p.org) + * + * El-Kabong: A speedy, yet forgiving, SAX-stylee HTML parser. + * + * The idea behind this parser is for it to use very little memory, and still + * be very speedy, while forgiving poorly written HTML. + + * The internals of the parser consist of a small memory buffer which is able + * to grow when not enough information is known to correctly parse a tag. + * Given the typical layout of HTML, 4k should be plenty. + * + * The main state engine loops through this internal buffer, determining what + * the next state should be. Once this is known, it passes off a segment to + * the state handlers (starttag, endtag, etc.) to process. The segment + * handlers and the main state engine communicate via a few variables. These + * variables indicate whether or not the main engine should switch state, + * or successfully remove some data, etc. The segment handlers are + * guaranteed the same starting data (though not the same pointer) on each + * invocation until the state is changed. Thus, the segment handlers cannot + * use pointers into the main buffer -- they must use offsets. + * + * Some of the speed is gained from using character map data found in + * ekhtml_tables.h. I don't have any empirical data for this yet -- + * it only sounds like it would be faster.. ;-) + * + * I'm always looking for ways to clean && speed up this code. Feel free + * to give feedback -- JMT + */ + + +#include +#include +#include +#include +#include + +#include "ekhtml_config.h" +#include "ekhtml.h" +#define EKHTML_USE_TABLES +#include "ekhtml_tables.h" +#define EKHTML_USE_PRIVATE +#include "ekhtml_private.h" + +#ifndef MIN +#define MIN(a,b) (((a)<(b)) ? (a) : (b)) +#endif + + +/* + * ekhtml_buffer_grow: Grow the parser's internal buffer by a blocksize. + * NOTE: Calling the function has the potential to + * change the data buffer location. Do + * not rely on it's location! + * + * Arguments: Parser = Parser to grow + */ + +static void ekhtml_buffer_grow(ekhtml_parser_t *parser){ + size_t newsize; + char *newbuf; + + newsize = parser->nalloced + EKHTML_BLOCKSIZE; + + if((newbuf = realloc(parser->buf, newsize)) == NULL){ + fprintf(stderr, "BAD! Can't allocate %d bytes in ekhtml_buffer_grow\n", + newsize); + fflush(stderr); /* Just in case someone changes the buffering scheme */ + } + + parser->buf = newbuf; + parser->nalloced = newsize; +} + +/* + * parser_state_determine: Determine the next state that the main parser + * should have, by investigating up to the first + * 4 characters in the buffer. + * + * Arguments: startp = Starting data pointer + * endp = Pointer to first byte of 'out of range' data + * + * Return values: Returns one of EKHTML_STATE_* indicating the state that + * was found. + * + */ + +static inline +int parser_state_determine(const char *startp, const char *endp){ + const char *firstchar; + int newstate; + + assert(startp != endp); + + if(*startp != '<') + return EKHTML_STATE_INDATA; + + firstchar = startp + 1; + if(firstchar == endp) + return EKHTML_STATE_NONE; + + newstate = EKCMap_EKState[(unsigned char)*firstchar]; + if(newstate == EKHTML_STATE_NONE){ + if(firstchar + 2 >= endp) /* Not enough data to evaluate */ + return EKHTML_STATE_NONE; + if(*(firstchar + 1) == '-' && *(firstchar + 2) == '-') + return EKHTML_STATE_COMMENT; + else + return EKHTML_STATE_SPECIAL; + } else + return newstate; +} + + +int ekhtml_parser_flush(ekhtml_parser_t *parser, int flushall){ + void **state_data = &parser->state.state_data; + char *buf = parser->buf, *curp = buf, *endp = buf + parser->nbuf; + int badp = -1, tmpstate = parser->state.state, didsomething = 0; + + while(curp != endp){ + char *workp = curp; + + if(tmpstate == EKHTML_STATE_NONE){ + tmpstate = parser_state_determine(workp, endp); + if(tmpstate == EKHTML_STATE_NONE) /* Not enough data yet */ + break; + } + + if(tmpstate == EKHTML_STATE_INDATA || tmpstate == EKHTML_STATE_BADDATA) + curp = ekhtml_parse_data(parser, workp, endp, tmpstate); + else if(endp - workp > 2){ /* All tags fall under this catagory */ + switch(tmpstate){ + case EKHTML_STATE_ENDTAG: + curp = ekhtml_parse_endtag(parser, state_data, + workp, endp, &badp); + break; + case EKHTML_STATE_STARTTAG: + curp = ekhtml_parse_starttag(parser, state_data, + workp, endp, &badp); + break; + case EKHTML_STATE_COMMENT: + curp = ekhtml_parse_comment(parser, state_data, + workp, endp, &badp); + break; + case EKHTML_STATE_SPECIAL: + curp = ekhtml_parse_special(parser, state_data, + workp, endp, &badp); + break; + default: + assert(!"Unimplemented state"); + } + } else { + curp = NULL; /* Not enough data, keep going */ + } + + /* If one of the parsers said the data was bad, reset the state */ + if(badp != -1){ + tmpstate = badp; + badp = -1; + } + + if(curp == NULL){ /* State needed more data, so break out */ + curp = workp; + break; + } + + if(workp != curp){ /* state backend cleared up some data */ + didsomething = 1; + tmpstate = EKHTML_STATE_NONE; + assert(*state_data == NULL); + } + } + + if(flushall){ + /* Flush whatever we didn't use */ + if(parser->datacb){ + ekhtml_string_t str; + + str.str = curp; + str.len = endp - curp; + parser->datacb(parser->cbdata, &str); + } + curp = endp; + didsomething = 1; + tmpstate = EKHTML_STATE_NONE; /* Clean up to an unknown state */ + *state_data = NULL; + } + + parser->state.state = tmpstate; + + if(didsomething){ + /* Shuffle the data back, based on where we ended up */ + parser->nbuf -= curp - buf; + if(endp - curp){ /* If there's still any data to move */ + memmove(buf, curp, endp - curp); + } + } + return didsomething; +} + +void ekhtml_parser_feed(ekhtml_parser_t *parser, ekhtml_string_t *str){ + size_t nfed = 0; + + while(nfed != str->len){ + size_t tocopy; + + /* First see how much we can fill up our internal buffer */ + tocopy = MIN(parser->nalloced - parser->nbuf, str->len - nfed); + memcpy(parser->buf + parser->nbuf, str->str + nfed, tocopy); + nfed += tocopy; + parser->nbuf += tocopy; + if(parser->nalloced == parser->nbuf){ + /* Process the buffer */ + if(!ekhtml_parser_flush(parser, 0)){ + /* If we didn't actually process anything, grow our buffer */ + ekhtml_buffer_grow(parser); + } + } + } +} + +void ekhtml_parser_datacb_set(ekhtml_parser_t *parser, ekhtml_data_cb_t cb){ + parser->datacb = cb; +} + +void ekhtml_parser_commentcb_set(ekhtml_parser_t *parser, ekhtml_data_cb_t cb){ + parser->commentcb = cb; +} + +void ekhtml_parser_cbdata_set(ekhtml_parser_t *parser, void *cbdata){ + parser->cbdata = cbdata; +} + +static void +ekhtml_parser_startendcb_add(ekhtml_parser_t *parser, const char *tag, + ekhtml_starttag_cb_t startcb, + ekhtml_endtag_cb_t endcb, + int isStart) +{ + ekhtml_tag_container *cont; + ekhtml_string_t lookup_str; + char *newtag, *cp; + unsigned int taglen; + hnode_t *hn; + + if(!tag){ + if(isStart) + parser->startcb_unk = startcb; + else + parser->endcb_unk = endcb; + return; + } + + + newtag = _strdup(tag); + for(cp=newtag; *cp; cp++) + *cp = toupper(*cp); + + taglen = cp - newtag; + + /* First see if the container already exists */ + lookup_str.str = newtag; + lookup_str.len = taglen; + + if((hn = hash_lookup(parser->startendcb, &lookup_str))){ + cont = hnode_get(hn); + free(newtag); + if(isStart) + cont->startfunc = startcb; + else + cont->endfunc = endcb; + } else { + ekhtml_string_t *set_str; + + cont = malloc(sizeof(*cont)); + if(isStart){ + cont->startfunc = startcb; + cont->endfunc = NULL; + } else { + cont->startfunc = NULL; + cont->endfunc = endcb; + } + set_str = malloc(sizeof(*set_str)); + *set_str = lookup_str; + hash_alloc_insert(parser->startendcb, set_str, cont); + } +} + +void ekhtml_parser_startcb_add(ekhtml_parser_t *parser, const char *tag, + ekhtml_starttag_cb_t cback) +{ + ekhtml_parser_startendcb_add(parser, tag, cback, NULL, 1); +} + +void ekhtml_parser_endcb_add(ekhtml_parser_t *parser, const char *tag, + ekhtml_endtag_cb_t cback) +{ + ekhtml_parser_startendcb_add(parser, tag, NULL, cback, 0); +} + + +static hash_val_t ekhtml_string_hash(const void *key){ + const ekhtml_string_t *s = key; + hash_val_t res = 5381; + const char *str = s->str; + size_t len = s->len; + int c; + + while(len--){ + c = str[len]; + res = ((res << 5) + res) + c; /* res * 33 + c */ + } + return res; +} + +static int ekhtml_string_comp(const void *key1, const void *key2){ + const ekhtml_string_t *s1 = key1, *s2 = key2; + + if(s1->len == s2->len) + return memcmp(s1->str, s2->str, s1->len); + return 1; +} + +void ekhtml_parser_destroy(ekhtml_parser_t *ekparser){ + hnode_t *hn; + hscan_t hs; + + hash_scan_begin(&hs, ekparser->startendcb); + while((hn = hash_scan_next(&hs))){ + ekhtml_string_t *key = (ekhtml_string_t *)hnode_getkey(hn); + ekhtml_tag_container *cont = hnode_get(hn); + + hash_scan_delete(ekparser->startendcb, hn); + free((char *)key->str); + free(key); + free(cont); + } + + hash_destroy(ekparser->startendcb); + ekhtml_parser_starttag_cleanup(ekparser); + free(ekparser->buf); + free(ekparser); +} + +ekhtml_parser_t *ekhtml_parser_new(void *cbdata){ + ekhtml_parser_t *res; + + res = malloc(sizeof(*res)); + res->datacb = NULL; + res->startendcb = hash_create(HASHCOUNT_T_MAX, ekhtml_string_comp, + ekhtml_string_hash); + res->cbdata = cbdata; + res->startcb_unk = NULL; + res->endcb_unk = NULL; + res->commentcb = NULL; + res->buf = NULL; + res->nalloced = 0; + res->nbuf = 0; + res->freeattrs = NULL; + res->state.state = EKHTML_STATE_NONE; + res->state.state_data = NULL; + + /* Start out with a buffer of 1 block size */ + ekhtml_buffer_grow(res); + return res; +} + -- cgit v1.2.3