summaryrefslogtreecommitdiff
path: root/plugins/MirOTR/ekhtml/src/ekhtml.c
diff options
context:
space:
mode:
authorKirill Volinsky <mataes2007@gmail.com>2012-05-19 18:01:32 +0000
committerKirill Volinsky <mataes2007@gmail.com>2012-05-19 18:01:32 +0000
commitb1509f22892dc98057c750e7fae39ded5cea3b09 (patch)
tree6bdcc9379ae86339a67022b758575729d1304074 /plugins/MirOTR/ekhtml/src/ekhtml.c
parente7a776a6f5ab323cd9dd824e815846ef268fa7f1 (diff)
added MirOTR
git-svn-id: http://svn.miranda-ng.org/main/trunk@83 1316c22d-e87f-b044-9b9b-93d7a3e3ba9c
Diffstat (limited to 'plugins/MirOTR/ekhtml/src/ekhtml.c')
-rw-r--r--plugins/MirOTR/ekhtml/src/ekhtml.c394
1 files changed, 394 insertions, 0 deletions
diff --git a/plugins/MirOTR/ekhtml/src/ekhtml.c b/plugins/MirOTR/ekhtml/src/ekhtml.c
new file mode 100644
index 0000000000..f3697469f6
--- /dev/null
+++ b/plugins/MirOTR/ekhtml/src/ekhtml.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2002, Jon Travis
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * ekhtml: The El-Kabong HTML parser
+ * by Jon Travis (jtravis@p00p.org)
+ *
+ * El-Kabong: A speedy, yet forgiving, SAX-stylee HTML parser.
+ *
+ * The idea behind this parser is for it to use very little memory, and still
+ * be very speedy, while forgiving poorly written HTML.
+ *
+ * The internals of the parser consist of a small memory buffer which is able
+ * to grow when not enough information is known to correctly parse a tag.
+ * Given the typical layout of HTML, 4k should be plenty.
+ *
+ * The main state engine loops through this internal buffer, determining what
+ * the next state should be. Once this is known, it passes off a segment to
+ * the state handlers (starttag, endtag, etc.) to process. The segment
+ * handlers and the main state engine communicate via a few variables. These
+ * variables indicate whether or not the main engine should switch state,
+ * or successfully remove some data, etc. The segment handlers are
+ * guaranteed the same starting data (though not the same pointer) on each
+ * invocation until the state is changed. Thus, the segment handlers cannot
+ * use pointers into the main buffer -- they must use offsets.
+ *
+ * Some of the speed is gained from using character map data found in
+ * ekhtml_tables.h. I don't have any empirical data for this yet --
+ * it only sounds like it would be faster.. ;-)
+ *
+ * I'm always looking for ways to clean && speed up this code. Feel free
+ * to give feedback -- JMT
+ */
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <assert.h>
+
+#include "ekhtml_config.h"
+#include "ekhtml.h"
+#define EKHTML_USE_TABLES
+#include "ekhtml_tables.h"
+#define EKHTML_USE_PRIVATE
+#include "ekhtml_private.h"
+
+#ifndef MIN /* Only defined here when no earlier header has already provided a MIN */
+#define MIN(a,b) (((a)<(b)) ? (a) : (b)) /* NOTE: one of the arguments is evaluated twice; pass side-effect free expressions only */
+#endif /* MIN */
+
+
+/*
+ * ekhtml_buffer_grow:  Grow the parser's internal buffer by one block
+ *                      (EKHTML_BLOCKSIZE bytes).
+ *                      NOTE:  realloc() may move the data, so callers must
+ *                             not rely on the buffer's location across
+ *                             this call.
+ *
+ * Arguments:           Parser = Parser to grow
+ *
+ * Failure handling:    If the allocation fails, a diagnostic is written to
+ *                      stderr and the process is aborted.  The function
+ *                      returns void, so callers cannot notice a failed
+ *                      grow; carrying on would store NULL in parser->buf
+ *                      (losing the old block) while parser->nalloced
+ *                      claims the larger size.
+ */
+
+static void ekhtml_buffer_grow(ekhtml_parser_t *parser){
+    size_t newsize;
+    char *newbuf;
+
+    newsize = parser->nalloced + EKHTML_BLOCKSIZE;
+
+    if((newbuf = realloc(parser->buf, newsize)) == NULL){
+        /* newsize is a size_t, not an int: convert explicitly for printing */
+        fprintf(stderr, "BAD! Can't allocate %lu bytes in ekhtml_buffer_grow\n",
+                (unsigned long)newsize);
+        fflush(stderr);  /* Just in case someone changes the buffering scheme */
+        abort();
+    }
+
+    parser->buf = newbuf;
+    parser->nalloced = newsize;
+}
+
+/*
+ * parser_state_determine:  Work out which state the main parser should
+ *                          enter next by looking at no more than the first
+ *                          4 characters of the pending data.
+ *
+ * Arguments:      startp = First byte of the pending data (must differ
+ *                          from endp)
+ *                 endp   = Pointer to first byte of 'out of range' data
+ *
+ * Return values:  One of the EKHTML_STATE_* values.  EKHTML_STATE_NONE
+ *                 means there is not yet enough data to decide.
+ */
+
+static inline
+int parser_state_determine(const char *startp, const char *endp){
+    const char *tagchar;
+    int mapped;
+
+    assert(startp != endp);
+
+    /* Anything that does not open with '<' is plain data */
+    if(*startp != '<')
+        return EKHTML_STATE_INDATA;
+
+    tagchar = startp + 1;
+    if(tagchar == endp)
+        return EKHTML_STATE_NONE;
+
+    /* The character after '<' usually settles the state on its own */
+    mapped = EKCMap_EKState[(unsigned char)*tagchar];
+    if(mapped != EKHTML_STATE_NONE)
+        return mapped;
+
+    /* Otherwise the two following bytes tell a comment from a special tag */
+    if(endp - tagchar <= 2)       /* Not enough data to evaluate */
+        return EKHTML_STATE_NONE;
+
+    return (tagchar[1] == '-' && tagchar[2] == '-') ? EKHTML_STATE_COMMENT
+                                                    : EKHTML_STATE_SPECIAL;
+}
+
+/*
+ * ekhtml_parser_flush:  Run the state engine over the parser's internal
+ *                       buffer, handing one segment at a time to the data
+ *                       and tag handlers, then move whatever was not
+ *                       consumed to the front of the buffer.
+ *
+ * Arguments:      parser   = Parser whose buffer is processed
+ *                 flushall = If non-zero, everything still left after
+ *                            parsing is passed to the data callback (when
+ *                            one is set, even if nothing is left), the
+ *                            buffer is emptied and the state is reset to
+ *                            EKHTML_STATE_NONE
+ *
+ * Return values:  Non-zero if data was removed from the buffer or flushall
+ *                 was given, zero if nothing could be processed.
+ */
+int ekhtml_parser_flush(ekhtml_parser_t *parser, int flushall){
+ void **state_data = &parser->state.state_data;
+ char *buf = parser->buf, *curp = buf, *endp = buf + parser->nbuf;
+ int badp = -1, tmpstate = parser->state.state, didsomething = 0;
+ /* curp always marks the first byte that has not been consumed yet */
+ while(curp != endp){
+ char *workp = curp;
+ /* No state carried over: classify the segment from its first bytes */
+ if(tmpstate == EKHTML_STATE_NONE){
+ tmpstate = parser_state_determine(workp, endp);
+ if(tmpstate == EKHTML_STATE_NONE) /* Not enough data yet */
+ break;
+ }
+ /* Data goes straight to the data handler; tag handlers are only run once more than 2 bytes are available */
+ if(tmpstate == EKHTML_STATE_INDATA || tmpstate == EKHTML_STATE_BADDATA)
+ curp = ekhtml_parse_data(parser, workp, endp, tmpstate);
+ else if(endp - workp > 2){ /* All tags fall under this category */
+ switch(tmpstate){
+ case EKHTML_STATE_ENDTAG:
+ curp = ekhtml_parse_endtag(parser, state_data,
+ workp, endp, &badp);
+ break;
+ case EKHTML_STATE_STARTTAG:
+ curp = ekhtml_parse_starttag(parser, state_data,
+ workp, endp, &badp);
+ break;
+ case EKHTML_STATE_COMMENT:
+ curp = ekhtml_parse_comment(parser, state_data,
+ workp, endp, &badp);
+ break;
+ case EKHTML_STATE_SPECIAL:
+ curp = ekhtml_parse_special(parser, state_data,
+ workp, endp, &badp);
+ break;
+ default:
+ assert(!"Unimplemented state"); /* NOTE(review): with asserts compiled out curp stays equal to workp here, so the loop would make no progress */
+ }
+ } else {
+ curp = NULL; /* Not enough data, keep going */
+ }
+
+ /* If one of the parsers said the data was bad, switch to the state it reported through badp */
+ if(badp != -1){
+ tmpstate = badp;
+ badp = -1;
+ }
+
+ if(curp == NULL){ /* State needed more data, so break out */
+ curp = workp;
+ break;
+ }
+
+ if(workp != curp){ /* state backend cleared up some data */
+ didsomething = 1;
+ tmpstate = EKHTML_STATE_NONE;
+ assert(*state_data == NULL); /* a handler that consumed data must not leave per-state data behind */
+ }
+ }
+
+ if(flushall){
+ /* Hand whatever is still unparsed to the data callback as plain data */
+ if(parser->datacb){
+ ekhtml_string_t str;
+
+ str.str = curp;
+ str.len = endp - curp;
+ parser->datacb(parser->cbdata, &str);
+ }
+ curp = endp;
+ didsomething = 1;
+ tmpstate = EKHTML_STATE_NONE; /* Clean up to an unknown state */
+ *state_data = NULL;
+ }
+
+ parser->state.state = tmpstate; /* picked up again as the starting state of the next call */
+
+ if(didsomething){
+ /* Shuffle the data back, based on where we ended up */
+ parser->nbuf -= curp - buf;
+ if(endp - curp){ /* If there's still any data to move */
+ memmove(buf, curp, endp - curp);
+ }
+ }
+ return didsomething;
+}
+
+/*
+ * ekhtml_parser_feed:  Append the bytes of 'str' to the parser's internal
+ *                      buffer.  Every time the buffer fills up it is
+ *                      flushed; if the flush could not process anything,
+ *                      the buffer is grown by one block instead.
+ *
+ * Arguments:           parser = Parser to feed
+ *                      str    = Data to copy in (str->len bytes at str->str)
+ */
+void ekhtml_parser_feed(ekhtml_parser_t *parser, ekhtml_string_t *str){
+    size_t consumed;
+
+    for(consumed = 0; consumed != str->len; ){
+        /* Copy as much as fits into the free tail of the buffer */
+        size_t room    = parser->nalloced - parser->nbuf;
+        size_t pending = str->len - consumed;
+        size_t chunk   = (room < pending) ? room : pending;
+
+        memcpy(parser->buf + parser->nbuf, str->str + consumed, chunk);
+        consumed += chunk;
+        parser->nbuf += chunk;
+
+        if(parser->nalloced != parser->nbuf)
+            continue;  /* Buffer not full yet */
+
+        /* Buffer is full: process it, or enlarge it if nothing was usable */
+        if(!ekhtml_parser_flush(parser, 0))
+            ekhtml_buffer_grow(parser);
+    }
+}
+
+/* Store 'cb' as the parser's data callback, replacing any previous one. */
+void ekhtml_parser_datacb_set(ekhtml_parser_t *parser, ekhtml_data_cb_t cb)
+{
+    parser->datacb = cb;
+}
+
+/* Store 'cb' as the parser's comment callback, replacing any previous one. */
+void ekhtml_parser_commentcb_set(ekhtml_parser_t *parser, ekhtml_data_cb_t cb)
+{
+    parser->commentcb = cb;
+}
+
+/* Store the opaque 'cbdata' pointer in the parser (the data callback
+ * receives it as its first argument). */
+void ekhtml_parser_cbdata_set(ekhtml_parser_t *parser, void *cbdata)
+{
+    parser->cbdata = cbdata;
+}
+
+/*
+ * ekhtml_parser_startendcb_add:  Register a start-tag or end-tag callback.
+ *
+ * Arguments:  parser  = Parser to register the callback with
+ *             tag     = Tag name, matched case-insensitively (an
+ *                       upper-cased copy is stored).  NULL installs the
+ *                       callback for tags without an entry of their own.
+ *             startcb = Start-tag callback, used when isStart is non-zero
+ *             endcb   = End-tag callback, used when isStart is zero
+ *             isStart = Selects which of the two callbacks is being set
+ *
+ * If the tag name or the new table entry cannot be allocated, a diagnostic
+ * is written to stderr and the callback is not registered.
+ */
+static void
+ekhtml_parser_startendcb_add(ekhtml_parser_t *parser, const char *tag,
+                             ekhtml_starttag_cb_t startcb,
+                             ekhtml_endtag_cb_t endcb,
+                             int isStart)
+{
+    ekhtml_tag_container *cont;
+    ekhtml_string_t lookup_str, *set_str;
+    char *newtag, *cp;
+    unsigned int taglen;
+    hnode_t *hn;
+
+    if(!tag){
+        if(isStart)
+            parser->startcb_unk = startcb;
+        else
+            parser->endcb_unk = endcb;
+        return;
+    }
+
+    if((newtag = _strdup(tag)) == NULL){
+        fprintf(stderr,
+                "BAD! Can't copy tag name in ekhtml_parser_startendcb_add\n");
+        return;
+    }
+
+    /* toupper() is only defined for values representable as unsigned char */
+    for(cp = newtag; *cp; cp++)
+        *cp = (char)toupper((unsigned char)*cp);
+
+    taglen = (unsigned int)(cp - newtag);
+
+    /* First see if the container already exists */
+    lookup_str.str = newtag;
+    lookup_str.len = taglen;
+
+    if((hn = hash_lookup(parser->startendcb, &lookup_str)) != NULL){
+        cont = hnode_get(hn);
+        free(newtag);   /* The existing entry already owns its own key */
+        if(isStart)
+            cont->startfunc = startcb;
+        else
+            cont->endfunc = endcb;
+        return;
+    }
+
+    cont = malloc(sizeof(*cont));
+    set_str = malloc(sizeof(*set_str));
+    if(cont == NULL || set_str == NULL){
+        fprintf(stderr,
+                "BAD! Can't allocate tag entry in ekhtml_parser_startendcb_add\n");
+        free(set_str);
+        free(cont);
+        free(newtag);
+        return;
+    }
+
+    if(isStart){
+        cont->startfunc = startcb;
+        cont->endfunc = NULL;
+    } else {
+        cont->startfunc = NULL;
+        cont->endfunc = endcb;
+    }
+
+    /* The table takes over set_str, the newtag buffer it points at, and
+     * cont; ekhtml_parser_destroy() frees all three.
+     * NOTE(review): the result of hash_alloc_insert() is not checked here;
+     * confirm whether it can fail and leave these allocations orphaned. */
+    *set_str = lookup_str;
+    hash_alloc_insert(parser->startendcb, set_str, cont);
+}
+
+/*
+ * ekhtml_parser_startcb_add:  Register 'cback' as the start-tag callback
+ *                             for 'tag'.  Forwards to
+ *                             ekhtml_parser_startendcb_add() with no
+ *                             end-tag callback.
+ */
+void ekhtml_parser_startcb_add(ekhtml_parser_t *parser, const char *tag,
+                               ekhtml_starttag_cb_t cback)
+{
+    const int is_start = 1;
+
+    ekhtml_parser_startendcb_add(parser, tag, cback, NULL, is_start);
+}
+
+/*
+ * ekhtml_parser_endcb_add:  Register 'cback' as the end-tag callback for
+ *                           'tag'.  Forwards to
+ *                           ekhtml_parser_startendcb_add() with no
+ *                           start-tag callback.
+ */
+void ekhtml_parser_endcb_add(ekhtml_parser_t *parser, const char *tag,
+                             ekhtml_endtag_cb_t cback)
+{
+    const int is_start = 0;
+
+    ekhtml_parser_startendcb_add(parser, tag, NULL, cback, is_start);
+}
+
+
+/*
+ * ekhtml_string_hash:  Hash function for the tag table.  'key' points at an
+ *                      ekhtml_string_t; its bytes are folded in from the
+ *                      last one to the first with hash = hash * 33 + byte,
+ *                      starting from 5381.
+ */
+static hash_val_t ekhtml_string_hash(const void *key){
+    const ekhtml_string_t *name = key;
+    const char *bytes = name->str;
+    hash_val_t acc = 5381;
+    size_t idx;
+
+    for(idx = name->len; idx > 0; idx--){
+        int ch = bytes[idx - 1];
+
+        acc = ((acc << 5) + acc) + ch;  /* acc * 33 + ch */
+    }
+    return acc;
+}
+
+/*
+ * ekhtml_string_comp:  Key comparison for the tag table.  Both keys point
+ *                      at ekhtml_string_t values.  Returns 0 when they have
+ *                      the same length and bytes; keys of different length
+ *                      always compare as 1, otherwise the memcmp() result
+ *                      is returned.
+ */
+static int ekhtml_string_comp(const void *key1, const void *key2){
+    const ekhtml_string_t *lhs = key1;
+    const ekhtml_string_t *rhs = key2;
+
+    if(lhs->len != rhs->len)
+        return 1;
+
+    return memcmp(lhs->str, rhs->str, lhs->len);
+}
+
+/*
+ * ekhtml_parser_destroy:  Release a parser and everything it owns: every
+ *                         entry of the tag callback table (key string, key
+ *                         and container), the table itself, the start-tag
+ *                         state and the data buffer.
+ *
+ * Arguments:              ekparser = Parser to destroy.  NULL is accepted
+ *                                    and ignored, as with free().
+ */
+void ekhtml_parser_destroy(ekhtml_parser_t *ekparser){
+    hnode_t *hn;
+    hscan_t hs;
+
+    if(ekparser == NULL)
+        return;
+
+    hash_scan_begin(&hs, ekparser->startendcb);
+    while((hn = hash_scan_next(&hs))){
+        /* Fetch key and payload before the node leaves the table */
+        ekhtml_string_t *key = (ekhtml_string_t *)hnode_getkey(hn);
+        ekhtml_tag_container *cont = hnode_get(hn);
+
+        /* NOTE(review): confirm that hash_scan_delete() also releases the
+         * node itself; if it only unlinks it, each hnode_t is leaked. */
+        hash_scan_delete(ekparser->startendcb, hn);
+        free((char *)key->str);
+        free(key);
+        free(cont);
+    }
+
+    hash_destroy(ekparser->startendcb);
+    ekhtml_parser_starttag_cleanup(ekparser);
+    free(ekparser->buf);
+    free(ekparser);
+}
+
+/*
+ * ekhtml_parser_new:  Allocate a parser with no callbacks registered, an
+ *                     empty tag table and an initial buffer of one block.
+ *
+ * Arguments:          cbdata = Opaque pointer stored in the parser (the
+ *                              data callback receives it as its first
+ *                              argument)
+ *
+ * Return values:      The new parser, to be released with
+ *                     ekhtml_parser_destroy(), or NULL if the parser or its
+ *                     tag table could not be allocated.
+ */
+ekhtml_parser_t *ekhtml_parser_new(void *cbdata){
+    ekhtml_parser_t *res;
+
+    res = malloc(sizeof(*res));
+    if(res == NULL)
+        return NULL;
+
+    res->startendcb = hash_create(HASHCOUNT_T_MAX, ekhtml_string_comp,
+                                  ekhtml_string_hash);
+    if(res->startendcb == NULL){
+        free(res);
+        return NULL;
+    }
+
+    res->datacb           = NULL;
+    res->cbdata           = cbdata;
+    res->startcb_unk      = NULL;
+    res->endcb_unk        = NULL;
+    res->commentcb        = NULL;
+    res->buf              = NULL;
+    res->nalloced         = 0;
+    res->nbuf             = 0;
+    res->freeattrs        = NULL;
+    res->state.state      = EKHTML_STATE_NONE;
+    res->state.state_data = NULL;
+
+    /* Start out with a buffer of 1 block size */
+    ekhtml_buffer_grow(res);
+    return res;
+}
+