#include "stdafx.h"
#include "striphtml.h"
#include "ekhtml.h"
#include "entities.h"

// Walks the parser's attribute list and returns the first attribute whose
// name equals `name` (case-insensitive, exact length), or NULL if none does.
// The length check matters: comparing only attr->name.len characters would
// let a shorter attribute name (or an empty one) match as a prefix.
static ekhtml_attr_t *find_attr(ekhtml_attr_t *attrs, const char *name)
{
	const size_t nameLen = strlen(name);
	for (ekhtml_attr_t *attr = attrs; attr != NULL; attr = attr->next) {
		if (attr->name.len == nameLen && _strnicmp(attr->name.str, name, nameLen) == 0)
			return attr;
	}
	return NULL;
}

// Start-tag callback registered with the ekhtml parser.
//   cbdata - the STRIPHTML_DATA passed to ekhtml_parser_new()
//   tag    - tag name (length-delimited; not assumed to be NUL-terminated)
//   attrs  - linked list of the tag's attributes, may be NULL
// Emits plain-text markers for <i>, <b>, <br> and <img>, and remembers the
// href of an <a> on data->stack so endtag_cb can print it after the link text.
void starttag_cb(void *cbdata, ekhtml_string_t *tag, ekhtml_attr_t *attrs)
{
	STRIPHTML_DATA *data = (STRIPHTML_DATA *)cbdata;

	switch (tag->len) {
	case 1:
		switch (*(tag->str)) {
		case 'a': case 'A':
			{
				ekhtml_attr_t *href = find_attr(attrs, "href");
				if (href != NULL) {
					// One extra byte serves as the terminator (relies on
					// mir_calloc returning zero-filled memory).
					char *url = (char *)mir_calloc(href->val.len + 1);
					strncpy(url, href->val.str, href->val.len);
					data->stack.push(url);
				}
			}
			break;

		case 'i': case 'I':
			data->buffer.append(" *");
			break;

		case 'b': case 'B':
			data->buffer.append(" _");
			break;
		}
		// Do not fall through: the two-character check below would read
		// tag->str[1], which lies past the end of a one-character tag name.
		break;

	case 2:
		// Cast to unsigned char: toupper() is undefined for negative values.
		if (toupper((unsigned char)tag->str[0]) == 'B' && toupper((unsigned char)tag->str[1]) == 'R')
			data->buffer.append("\r\n");
		break;

	case 3:
		if (_strnicmp(tag->str, "img", 3) == 0) {
			data->buffer.append("IMAGE [ ");
			ekhtml_attr_t *src = find_attr(attrs, "src");
			if (src != NULL)
				data->buffer.append(src->val.str, src->val.len);
			data->buffer.append(" ] ");
		}
		break;
	}
}

// End-tag callback registered with the ekhtml parser.
// Closes the markers opened by starttag_cb: for </a> it appends the pending
// URL from data->stack (if any) as " [ url ] " and releases it; for </i> and
// </b> it appends the closing "* " / "_ " marker.
void endtag_cb(void *cbdata, ekhtml_string_t *tag)
{
	STRIPHTML_DATA *data = (STRIPHTML_DATA *)cbdata;

	switch (tag->len) {
	case 1:
		switch (*(tag->str)) {
		case 'a': case 'A':
			// Nothing pending: the <a> had no href, or data_cb already
			// dropped it because the link text was the URL itself.
			if (data->stack.empty())
				break;
			data->buffer.append(" [ ");
			data->buffer.append(data->stack.top());
			mir_free(data->stack.top());
			data->stack.pop();
			data->buffer.append(" ] ");
			break;

		case 'i': case 'I':
			data->buffer.append("* ");
			break;

		case 'b': case 'B':
			data->buffer.append("_ ");
			break;
		}
	}
}

// Character-data callback registered with the ekhtml parser.
// Decodes HTML entities in the text run and appends the result to the output
// buffer. If the decoded text equals (case-insensitively) the URL on top of
// data->stack, that URL is discarded so it is not printed a second time by
// endtag_cb.
void data_cb(void *cbdata, ekhtml_string_t *text)
{
	STRIPHTML_DATA *data = (STRIPHTML_DATA *)cbdata;

	// Destination is sized to the raw text plus a terminator
	// (presumably decoding never grows the text - confirm against entities.h).
	char *decoded = (char *)mir_calloc(text->len + 1);
	decode_html_entities_utf8(decoded, text->str, text->len);

	if (!data->stack.empty()) {
		char *top = data->stack.top();
		if (_stricmp(decoded, top) == 0) {
			mir_free(top);
			data->stack.pop();
		}
	}

	data->buffer.append(decoded);
	mir_free(decoded);
}

// Converts an HTML fragment to plain text by running it through the ekhtml
// parser with the callbacks above.
//   html - NUL-terminated HTML text; NULL yields NULL.
// Returns a copy of the stripped text made with mir_strdup
// (presumably to be released by the caller with mir_free).
char *striphtml(char *html)
{
	if (html == NULL)
		return NULL;

	STRIPHTML_DATA data;
	ekhtml_string_t ekstring;
	ekstring.len = strlen(html);
	ekstring.str = html;

	data.buffer.reserve(ekstring.len);

	ekhtml_parser_t *parser = ekhtml_parser_new(&data);
	ekhtml_parser_datacb_set(parser, &data_cb);
	ekhtml_parser_startcb_add(parser, NULL, &starttag_cb);
	ekhtml_parser_endcb_add(parser, NULL, &endtag_cb);

	ekhtml_parser_feed(parser, &ekstring);
	ekhtml_parser_flush(parser, 1);

	// Release URLs of <a> tags that were never closed.
	while (!data.stack.empty()) {
		mir_free(data.stack.top());
		data.stack.pop();
	}

	ekhtml_parser_destroy(parser);

	return mir_strdup(data.buffer.c_str());
}