/* engparser.c */

/* This file is a part of WordExtract project
 *
 * Copyright (C) 2009 Borisov Alexandr
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "engparser.h"
#include "word.h"

/*---------OPTIONS---------*/
/* When non-zero, parseengphrase() lowercases the first letter it meets
 * after a sentence start. */
int lower_first_capital = 1;
/* When non-zero, parse_eng_word() rejects every word that contains an
 * uppercase letter. */
int excl_w_capital = 1;

/*
 * Per-symbol rule sets that parseengphrase() passes to parse_eng_word().
 * NOTE(review): the meaning of each initializer depends on the field order
 * of Parseoptions, which is declared outside this file (presumably in
 * engparser.h; the first field is presumably `symbol`) -- confirm against
 * the header before changing any of these values.
 */
Parseoptions hyphen = {'-', 0, 1, 0, 1, 0};
Parseoptions quote = {'\'', 0, 1, 1, 0, 0};

/* Forward declaration of the file-local helper defined further below. */
static void del_first_n_last(char *word);

/*
 * Splits one line of English text into words and passes every accepted
 * word to to_list().
 *
 * Receives: a line terminated by "\n\0". A word is only emitted when a
 *	non-word character follows it, so a trailing word with no delimiter
 *	after it is not processed.
 *
 * Letters, "'" and "-" are collected as word characters, so words with "-"
 *	are processed as a whole word.
 *	We also suppose that each sentence starts with a capital letter and
 *	that any other word in the middle of a sentence which starts with a
 *	capital letter is a name (people, towns, I, etc...). When
 *	lower_first_capital is set, the first letter met at the start of the
 *	line or after '.', '!' or '?' is lowercased.
 * Single quoted sentences are not allowed. It's grammatically incorrect.
 *
 * A word is added to the list only when parse_eng_word() returns 0 for both
 *	the quote rules and the hyphen rules (checked in that order).
 * Tokens longer than WORDLENGTH-1 characters are truncated to fit the local
 *	buffer; the excess characters are dropped.
 *
 * Returns: always 0.
 */
int parseengphrase(char *phrase)
{
	extern Parseoptions hyphen;
	extern Parseoptions quote;
	char word[WORDLENGTH] = {0};
	int sentence_start = 1;
	int i, k;

	for (i = 0, k = 0; phrase[k] != '\0'; k++) {
		/* <ctype.h> functions need a value representable as unsigned char;
		 * a negative plain char would be undefined behaviour. */
		unsigned char c = (unsigned char)phrase[k];

		if (isalpha(c) || c == '\'' || c == '-') {
			/* Only a real letter (not a leading quote or hyphen)
			 * consumes the "sentence start" state. */
			if (lower_first_capital && sentence_start && c != '\'' && c != '-') {
				c = (unsigned char)tolower(c);
				sentence_start = 0;
			}
			/* Always leave room for the terminating NUL. */
			if ((size_t)i + 1 < sizeof word)
				word[i++] = (char)c;
		} else {
			if ((*word != '\0') && !parse_eng_word(word, quote) && !parse_eng_word(word, hyphen))
				to_list(word);
			sentence_start = sentence_start || (c == '.') || (c == '!') || (c == '?');
			/* Reset the buffer for the next word. */
			memset(word, 0, sizeof word);
			i = 0;
		}
	}
	return 0;
}

/*
 * Checks one word against the rules in opt for the character opt.symbol.
 *
 * word: NUL-terminated, writable string. It may be modified in place:
 *	- if the symbol occurs in the middle of the word and
 *	  opt.excl_word_after_symb is set while opt.excl_w_middle is not,
 *	  the word is cut at that symbol;
 *	- if the word both starts and ends with the symbol, its first and
 *	  last characters are removed via del_first_n_last() (this happens
 *	  even when the word is rejected afterwards).
 * opt: the symbol and the exclusion flags to apply.
 *
 * Returns WORD_NOT_INCLUDED when
 *	- excl_w_capital is set and an uppercase letter was seen, or
 *	- the word is enclosed in the symbol and opt.excl_symbolled is set, or
 *	- it only starts with the symbol and opt.excl_w_starting is set, or
 *	- the symbol is in the middle and opt.excl_w_middle is set, or
 *	- it only ends with the symbol and opt.ecxl_w_ending is set;
 * otherwise returns WORD_INCLUDED.
 */
int parse_eng_word(char *word, Parseoptions opt)
{
	extern int excl_w_capital;
	int ends = 0, sts = 0, mids = 0;
	int symbolled;
	int capital = 0;
	size_t wlength;
	size_t i;

	wlength = strlen(word);
	for (i = 0; word[i] != '\0'; i++) {
		/* Cast: passing a negative plain char to isupper() is undefined. */
		if (isupper((unsigned char)word[i]))
			capital = 1;
		if (word[i] == opt.symbol) {
			if (i == 0)
				sts = 1;
			else if (i == wlength - 1)
				ends = 1;
			else
				mids = 1;
		}
		/* Drop everything from the middle symbol onwards and stop scanning. */
		if (mids && opt.excl_word_after_symb && !opt.excl_w_middle) {
			word[i] = '\0';
			break;
		}
	}
	if (excl_w_capital && capital)
		return WORD_NOT_INCLUDED;
	/* "Symbolled" means enclosed in the symbol on both sides. */
	symbolled = sts && ends;
	if (symbolled)
		del_first_n_last(word);
	if (opt.excl_symbolled && symbolled)
		return WORD_NOT_INCLUDED;
	if (opt.excl_w_starting && sts && !symbolled)
		return WORD_NOT_INCLUDED;
	if (opt.excl_w_middle && mids)
		return WORD_NOT_INCLUDED;
	if (opt.ecxl_w_ending && ends && !symbolled)
		return WORD_NOT_INCLUDED;
	return WORD_INCLUDED;
}

/*
 * Deletes the first and the last character of the word in place,
 * whatever those characters are (e.g. "'word'" -> "word",
 * "-word-" -> "word"). Characters in between, including further
 * occurrences of the enclosing symbol, are kept.
 * A string shorter than two characters becomes empty.
 */
static void del_first_n_last(char *word)
{
	size_t wlength = strlen(word);

	if (wlength < 2) {
		word[0] = '\0';
		return;
	}
	/* Source and destination overlap, so memmove rather than memcpy. */
	memmove(word, word + 1, wlength - 2);
	word[wlength - 2] = '\0';
}