diff options
author | b0ric <b0risov.alexandr@rambler.ru> | 2009-08-18 20:45:05 +0300 |
---|---|---|
committer | b0ric <b0risov.alexandr@rambler.ru> | 2009-08-18 20:45:05 +0300 |
commit | 7742a666d5ac98ea3a976336cf60a5530ab29e73 (patch) | |
tree | f985cda18f15f8a13feb5c10d17c5cc7a952f23a /src/engparser.c | |
parent | e5165d9838d03963284710e6b957fe9a3b0e267a (diff) |
Diffstat (limited to 'src/engparser.c')
-rw-r--r-- | src/engparser.c | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/src/engparser.c b/src/engparser.c new file mode 100644 index 0000000..af511e5 --- /dev/null +++ b/src/engparser.c @@ -0,0 +1,118 @@ +/* + * This file is part of WordExtract. + * + * Copyright (C) 2009 Borisov Alexandr + * + * WordExtract is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * WordExtract is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with WordExtract. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include "mainwin.h" +#include "engparser.h" +#include "word.h" +#include "dict.h" + +int lower_first_capital = 1; +int excl_w_capital = 1; +Parseoptions hyphen = {'-', 0, 1, 0, 1, 0}; +Parseoptions quote = {'\'', 0, 1, 1, 0, 0}; + +/* It deletes first and last symbol in the word*/ +static void del_first_n_last(char *word); + +int parseengphrase(char *phrase) +{ + extern Parseoptions hyphen; + extern Parseoptions quote; + char word[WORDLENGTH] = {0}; + int sentence_start = 1; + int i, k; + + add_sentence(phrase); + for (i = 0, k = 0; phrase[k] != '\0'; k++) { + if ((isalpha(phrase[k]))||(phrase[k] == '\'')||(phrase[k] == '-')) + { + word[i++] = phrase[k]; + if ((lower_first_capital)&&(sentence_start)&&(phrase[k] != '\'')&&(phrase[k] != '-')) { + word[i-1] = tolower(word[i-1]); + sentence_start = 0; + } + } + else { + if ((*word != '\0')&&!parse_eng_word(word, quote)&& + !parse_eng_word(word, hyphen)&&!is_in_dict(word, dict)) + words = add_word(words, word); + sentence_start = sentence_start||(phrase[k] == '.')||(phrase[k] == '!')||(phrase[k] == '?'); + for (i = 0; i < WORDLENGTH; i++) + word[i] = 0; + i = 0; + } + } + return 0; +} + +int parse_eng_word(char *word, Parseoptions opt) +{ + extern int excl_w_capital; + int ends = 0, sts = 0, mids = 0; + int symbolled = 0; + int capital = 0; + int wlength = 0; + int i; + + wlength = strlen(word); + for (i = 0; word[i] != '\0'; i++) { + capital = capital||isupper(word[i]); + if (word[i] == opt.symbol) { + if (i == 0) + sts = 1; + else if (i == wlength-1) + ends = 1; + else + mids = 1; + } + if (mids&&opt.excl_word_after_symb&&!opt.excl_w_middle) { + word[i] = '\0'; + break; + } + } + if (excl_w_capital&&capital) + return WORD_NOT_INCLUDED; + symbolled = sts&&ends; + if (symbolled) + del_first_n_last(word); + if (opt.excl_symbolled&&symbolled) + return WORD_NOT_INCLUDED; + if (opt.excl_w_starting&&sts&&!symbolled) + return WORD_NOT_INCLUDED; + if (opt.excl_w_middle&&mids) + return WORD_NOT_INCLUDED; + if (opt.excl_w_ending&&ends&&!symbolled) + return WORD_NOT_INCLUDED; + return WORD_INCLUDED; +} + +static void del_first_n_last(char *word) +{ + int i, k; + int wlength; + + wlength = strlen(word); + for (k = 0, i = 1; word[i] != '\''; i++, k++) + word[k] = word[i]; + word[k] = '\0'; +} + |