1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
/* This file is a part of WordExtract project
*
* Copyright (C) 2009 Borisov Alexandr
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
#include <stdio.h>
#include <ctype.h>
#include <string.h>
#include "engparser.h"
#include "word.h"
/*---------OPTIONS---------*/
/* When non-zero, parseengphrase() lowercases the first letter it meets at a sentence start. */
int lower_first_capital = 1;
/* When non-zero, parse_eng_word() rejects a word in which it finds an uppercase letter. */
int excl_w_capital = 1;
/*
 * Per-symbol rule sets passed to parse_eng_word(): one for '-' and one for '\''.
 * The initializers are positional; the field order comes from the Parseoptions
 * declaration, which is not in this file (presumably engparser.h - confirm there
 * before reordering or editing these values).
 */
Parseoptions hyphen = {'-', 0, 1, 0, 1, 0};
Parseoptions quote = {'\'', 0, 1, 1, 0, 0};
/* Forward declaration of a file-local helper defined later in this file. */
static void del_first_n_last(char *word);
/*
 * Splits one line of English text into words and hands the accepted ones
 * to to_list().
 *
 * Receives: a NUL-terminated line, normally ending in "\n\0". A word that
 * runs right up to the terminating NUL is processed as well, so a last
 * line without a trailing newline does not lose its final word.
 *
 * A word is a run of letters, apostrophes and hyphens, so a word with "-"
 * is processed as a whole word. Every call starts in the "sentence start"
 * state, and '.', '!' or '?' re-enter it. When lower_first_capital is set,
 * the first letter met in that state is lowercased; the assumption is that
 * each sentence starts with a capital letter and that any other word
 * starting with a capital letter is a name (people, towns, "I", etc.).
 * Single-quoted sentences are not supported: that is grammatically incorrect.
 *
 * Each word is filtered by parse_eng_word(), first with the quote rules
 * and then with the hyphen rules; both calls may edit the word in place.
 * A word that does not fit into the local buffer (WORDLENGTH - 1
 * characters) is skipped instead of overflowing it.
 *
 * Returns: always 0.
 */
int parseengphrase(char *phrase)
{
    extern Parseoptions hyphen;
    extern Parseoptions quote;
    char word[WORDLENGTH] = {0};
    int sentence_start = 1;
    int too_long = 0;
    size_t i = 0;
    size_t k;

    for (k = 0; ; k++) {
        /* <ctype.h> functions are only defined for unsigned char values (and EOF). */
        unsigned char c = (unsigned char)phrase[k];

        if (isalpha(c) || (c == '\'') || (c == '-')) {
            if (lower_first_capital && sentence_start && (c != '\'') && (c != '-')) {
                c = (unsigned char)tolower(c);
                sentence_start = 0;
            }
            /* Always keep room for the terminating NUL. */
            if (i + 1 < sizeof word)
                word[i++] = (char)c;
            else
                too_long = 1;
            continue;
        }

        /*
         * End of a word (or end of input). The filters may strip the word
         * down to nothing (e.g. "''"), hence the second emptiness check.
         */
        if (!too_long && (*word != '\0')
            && !parse_eng_word(word, quote) && !parse_eng_word(word, hyphen)
            && (*word != '\0'))
            to_list(word);

        if (c == '\0')
            break;

        sentence_start = sentence_start || (c == '.') || (c == '!') || (c == '?');
        memset(word, 0, sizeof word);
        i = 0;
        too_long = 0;
    }
    return 0;
}
/*
 * Checks one word against the rules for a single special symbol
 * (opt.symbol) and decides whether the word is kept.
 *
 * The word is scanned once, noting whether the symbol occurs at the first
 * position, at the last position or in between, and whether an uppercase
 * letter was seen. The word may be modified in place, even when it ends
 * up rejected:
 *  - if a symbol is found in the middle while opt.excl_word_after_symb is
 *    set and opt.excl_w_middle is clear, the word is cut at that symbol
 *    and the scan stops there;
 *  - if the symbol is both the first and the last character, those two
 *    characters are removed with del_first_n_last().
 *
 * Returns WORD_NOT_INCLUDED when
 *  - excl_w_capital is set and an uppercase letter was seen, or
 *  - the word was wrapped in the symbol and opt.excl_symbolled is set, or
 *  - it only starts with the symbol and opt.excl_w_starting is set, or
 *  - it has the symbol in the middle and opt.excl_w_middle is set, or
 *  - it only ends with the symbol and opt.ecxl_w_ending is set;
 * otherwise returns WORD_INCLUDED.
 */
int parse_eng_word(char *word, Parseoptions opt)
{
    extern int excl_w_capital;
    int ends = 0, sts = 0, mids = 0;
    int symbolled;
    int capital = 0;
    size_t wlength = strlen(word);
    size_t i;

    for (i = 0; word[i] != '\0'; i++) {
        /* isupper() is only defined for unsigned char values (and EOF). */
        if (isupper((unsigned char)word[i]))
            capital = 1;
        if (word[i] == opt.symbol) {
            if (i == 0)
                sts = 1;
            else if (i == wlength - 1)
                ends = 1;
            else
                mids = 1;
        }
        /* Keep only the part before the first symbol found in the middle. */
        if (mids && opt.excl_word_after_symb && !opt.excl_w_middle) {
            word[i] = '\0';
            break;
        }
    }

    if (excl_w_capital && capital)
        return WORD_NOT_INCLUDED;

    symbolled = sts && ends;
    if (symbolled)
        del_first_n_last(word);

    if (opt.excl_symbolled && symbolled)
        return WORD_NOT_INCLUDED;
    if (opt.excl_w_starting && sts && !symbolled)
        return WORD_NOT_INCLUDED;
    if (opt.excl_w_middle && mids)
        return WORD_NOT_INCLUDED;
    if (opt.ecxl_w_ending && ends && !symbolled)
        return WORD_NOT_INCLUDED;
    return WORD_INCLUDED;
}
/*
 * Deletes the first and the last character of the word, in place,
 * e.g. "'word'" becomes "word" and "-word-" becomes "word".
 * The characters removed are chosen by position, not by value, so the
 * helper works for any wrapping symbol and never scans past the
 * terminating NUL. A word shorter than two characters becomes empty.
 */
static void del_first_n_last(char *word)
{
    size_t wlength = strlen(word);

    if (wlength < 2) {
        word[0] = '\0';
        return;
    }
    /* Source and destination overlap, so memmove() rather than memcpy(). */
    memmove(word, word + 1, wlength - 2);
    word[wlength - 2] = '\0';
}
|