summaryrefslogtreecommitdiff
path: root/MirOTR/striphtml.cpp
blob: 3262f6d0ffaeb8d2f203c7a2410a29e9c31288de (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#include "stdafx.h"
#include "striphtml.h"
#include "ekhtml.h"
#include "entities.h"

void starttag_cb (void *cbdata, ekhtml_string_t *tag, ekhtml_attr_t *attrs) {
	STRIPHTML_DATA *data = (STRIPHTML_DATA *)cbdata;
	switch (tag->len) {
		case 1:
			switch (*(tag->str)) {
				case 'a':
				case 'A':
					{
					ekhtml_attr_t *attr = attrs;
					while (attr) {
						if (_strnicmp(attr->name.str, "href", attr->name.len)==0) {
							data->stack.push(strncpy((char*)mir_calloc(attr->val.len+1), attr->val.str, attr->val.len));
							break;
						}
					}
					}break;
				case 'i':
				case 'I':
					data->buffer.append(" *");
					break;
				case 'b':
				case 'B':
					data->buffer.append(" _");
					break;
			}
		case 2:
			if (toupper(tag->str[0]) == 'B' && toupper(tag->str[1]) == 'R') 
				data->buffer.append("\r\n");
			break;
		case 3:
			if (_strnicmp(tag->str, "img", 3) == 0) {
				ekhtml_attr_t *attr = attrs;
				data->buffer.append("IMAGE [ ");
					while (attr) {
						if (_strnicmp(attr->name.str, "src", attr->name.len)==0) {
							data->buffer.append(attr->val.str, attr->val.len);
							break;
						}
					}
				data->buffer.append(" ] ");
			}
			break;
	}
}

void endtag_cb (void *cbdata, ekhtml_string_t *tag) {
	STRIPHTML_DATA *data = (STRIPHTML_DATA *)cbdata;
	switch (tag->len) {
		case 1:
			switch (*(tag->str)) {
				case 'a':
				case 'A':
					if (data->stack.empty()) break;
					data->buffer.append(" [ ");
					data->buffer.append(data->stack.top());
					mir_free(data->stack.top());
					data->stack.pop();
					data->buffer.append(" ] ");
					break;
				case 'i':
				case 'I':
					data->buffer.append("* ");
					break;
				case 'b':
				case 'B':
					data->buffer.append("_ ");
					break;
			}
	}

}

void data_cb (void *cbdata, ekhtml_string_t *text) {
	STRIPHTML_DATA *data = (STRIPHTML_DATA *)cbdata;
	char* s = (char*) mir_calloc(text->len+1);
	decode_html_entities_utf8(s, text->str, text->len);

	if (!data->stack.empty()) {
		char *top = data->stack.top();
		if (_stricmp(s, top)==0) {
			mir_free(top);
			data->stack.pop();
		}
	}
	

	data->buffer.append(s);
	mir_free(s);
}

/**
 * Converts an HTML fragment to plain text.
 *
 * Feeds `html` through an ekhtml parser wired to data_cb, starttag_cb and
 * endtag_cb, which accumulate the plain-text rendering in a STRIPHTML_DATA
 * buffer.
 *
 * @param html  NUL-terminated HTML input; NULL is accepted and yields NULL
 * @return      a copy of the stripped text made with mir_strdup, or NULL when
 *              `html` is NULL. NOTE(review): the caller presumably releases
 *              it with mir_free - confirm against callers.
 */
char * striphtml(char *html) {
	// strlen(NULL) is undefined; there is nothing to strip in that case.
	if (html == NULL)
		return NULL;

	STRIPHTML_DATA data;
	ekhtml_string_t ekstring;

	ekstring.len = strlen(html);
	ekstring.str = html;

	data.buffer.clear();
	data.buffer.reserve(ekstring.len);

	ekhtml_parser_t *parser = ekhtml_parser_new(&data);
	ekhtml_parser_datacb_set(parser, &data_cb);
	ekhtml_parser_startcb_add(parser, NULL, &starttag_cb);
	ekhtml_parser_endcb_add(parser, NULL, &endtag_cb);

	ekhtml_parser_feed(parser, &ekstring);
	ekhtml_parser_flush(parser, 1);

	// Hrefs from <a> tags that were never closed are still on the stack; free them.
	while (!data.stack.empty()) {
		mir_free(data.stack.top());
		data.stack.pop();
	}

	ekhtml_parser_destroy(parser);

	char *s = mir_strdup(data.buffer.c_str());
	data.buffer.erase();
	return s;
}