summaryrefslogtreecommitdiff
path: root/libs/libmosquitto/src/utf8_mosq.c
blob: 875c1c7f2755052e354906b5bdf3f027220f1e81 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
/*
Copyright (c) 2016-2019 Roger Light <roger@atchoo.org>

All rights reserved. This program and the accompanying materials
are made available under the terms of the Eclipse Public License v1.0
and Eclipse Distribution License v1.0 which accompany this distribution.
 
The Eclipse Public License is available at
   http://www.eclipse.org/legal/epl-v10.html
and the Eclipse Distribution License is available at
  http://www.eclipse.org/org/documents/edl-v10.php.
 
Contributors:
   Roger Light - initial implementation.
*/

#include "config.h"

#include <stdio.h>
#include "mosquitto.h"

/*
 * Check that a byte buffer contains only UTF-8 sequences that this library
 * accepts.
 *
 * str - buffer to validate. It does not need to be NUL terminated; only the
 *       first len bytes are examined.
 * len - number of bytes in str. Must be in the range 0..65536.
 *
 * Returns:
 *   MOSQ_ERR_SUCCESS        - every sequence in the buffer was accepted (an
 *                             empty buffer is accepted).
 *   MOSQ_ERR_INVAL          - str is NULL or len is out of range.
 *   MOSQ_ERR_MALFORMED_UTF8 - the buffer contains a NUL byte, an invalid lead
 *                             or continuation byte, a truncated sequence, an
 *                             overlong encoding, a UTF-16 surrogate, a code
 *                             point above U+10FFFF, a Unicode non-character,
 *                             or a control character (U+0001..U+001F,
 *                             U+007F..U+009F).
 */
int mosquitto_validate_utf8(const char *str, int len)
{
	int i;
	int j;
	int codelen;
	int codepoint;
	const unsigned char *ustr = (const unsigned char *)str;

	if(!str) return MOSQ_ERR_INVAL;
	/* NOTE(review): MQTT length-prefixed strings hold at most 65535 bytes,
	 * yet 65536 is accepted here. Kept unchanged for compatibility with
	 * existing callers - confirm whether this is intended. */
	if(len < 0 || len > 65536) return MOSQ_ERR_INVAL;

	for(i=0; i<len; i++){
		/* Classify the lead byte: sequence length and the payload bits it carries. */
		if(ustr[i] == 0){
			return MOSQ_ERR_MALFORMED_UTF8;
		}else if(ustr[i] <= 0x7f){
			codelen = 1;
			codepoint = ustr[i];
		}else if((ustr[i] & 0xE0) == 0xC0){
			/* 110xxxxx - 2 byte sequence */
			if(ustr[i] == 0xC0 || ustr[i] == 0xC1){
				/* These lead bytes can only encode values below 0x80,
				 * i.e. an overlong 2 byte encoding. */
				return MOSQ_ERR_MALFORMED_UTF8;
			}
			codelen = 2;
			codepoint = (ustr[i] & 0x1F);
		}else if((ustr[i] & 0xF0) == 0xE0){
			/* 1110xxxx - 3 byte sequence */
			codelen = 3;
			codepoint = (ustr[i] & 0x0F);
		}else if((ustr[i] & 0xF8) == 0xF0){
			/* 11110xxx - 4 byte sequence */
			if(ustr[i] > 0xF4){
				/* Invalid, this would produce values > 0x10FFFF. */
				return MOSQ_ERR_MALFORMED_UTF8;
			}
			codelen = 4;
			codepoint = (ustr[i] & 0x07);
		}else{
			/* Unexpected continuation byte. */
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* The whole sequence occupies ustr[i] .. ustr[i+codelen-1], so it
		 * only fits when i + codelen <= len. Reject every shortfall, not
		 * just a shortfall of exactly one byte, so that the loop below can
		 * never read at or beyond ustr[len]. */
		if(i > len - codelen){
			/* Not enough data */
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Reconstruct full code point */
		for(j=0; j<codelen-1; j++){
			if((ustr[++i] & 0xC0) != 0x80){
				/* Not a continuation byte */
				return MOSQ_ERR_MALFORMED_UTF8;
			}
			codepoint = (codepoint<<6) | (ustr[i] & 0x3F);
		}

		/* Check for UTF-16 high/low surrogates */
		if(codepoint >= 0xD800 && codepoint <= 0xDFFF){
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Check for overlong or out of range encodings. The 2 byte case
		 * needs no check here because rejecting the 0xC0 and 0xC1 lead
		 * bytes above already rules out every overlong 2 byte encoding. */
		if(codelen == 3 && codepoint < 0x0800){
			return MOSQ_ERR_MALFORMED_UTF8;
		}else if(codelen == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)){
			return MOSQ_ERR_MALFORMED_UTF8;
		}

		/* Check for non-characters: U+FDD0..U+FDEF and the last two code
		 * points of every plane (U+xxFFFE, U+xxFFFF). */
		if(codepoint >= 0xFDD0 && codepoint <= 0xFDEF){
			return MOSQ_ERR_MALFORMED_UTF8;
		}
		if((codepoint & 0xFFFF) == 0xFFFE || (codepoint & 0xFFFF) == 0xFFFF){
			return MOSQ_ERR_MALFORMED_UTF8;
		}
		/* Check for control characters */
		if(codepoint <= 0x001F || (codepoint >= 0x007F && codepoint <= 0x009F)){
			return MOSQ_ERR_MALFORMED_UTF8;
		}
	}
	return MOSQ_ERR_SUCCESS;
}