diff options
Diffstat (limited to 'engparser.c')
-rw-r--r-- | engparser.c | 126 |
1 files changed, 126 insertions, 0 deletions
diff --git a/engparser.c b/engparser.c new file mode 100644 index 0000000..2c3c77f --- /dev/null +++ b/engparser.c @@ -0,0 +1,126 @@ +/* This file is a part of WordExtract project + * + * Copyright (C) 2009 Borisov Alexandr + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <stdio.h> +#include <ctype.h> +#include <string.h> +#include "engparser.h" +#include "word.h" + +/*---------OPTIONS---------*/ +int lower_first_capital = 1; +int excl_w_capital = 1; + +Parseoptions hyphen = {'-', 0, 1, 0, 1, 0}; +Parseoptions quote = {'\'', 0, 1, 1, 0, 0}; + +static void del_first_n_last(char *word); + +/* + * Recieves: line terminated by "\n\0" + * + * Words with "-" we'll be processed as a whole word + * And also we suppose that each sentence starts with capital letter + * and other words in the middle of the sentence which starts with + * capital letter is name (peoples, towns, I, etc...). + * Single quoted sentences are not allowed. It's gramatically incorrect + */ +int parseengphrase(char *phrase) +{ + extern Parseoptions hyphen; + extern Parseoptions quote; + char word[WORDLENGTH] = {0}; + int sentence_start = 1; + int i, k; + + for (i = 0, k = 0; phrase[k] != '\0'; k++) { + if ((isalpha(phrase[k]))||(phrase[k] == '\'')||(phrase[k] == '-')) + { + word[i++] = phrase[k]; + if ((lower_first_capital)&&(sentence_start)&&(phrase[k] != '\'')&&(phrase[k] != '-')) { + word[i-1] = tolower(word[i-1]); + sentence_start = 0; + } + } + else { + if ((*word != '\0')&&!parse_eng_word(word, quote)&&!parse_eng_word(word, hyphen)) + to_list(word); + sentence_start = sentence_start||(phrase[k] == '.')||(phrase[k] == '!')||(phrase[k] == '?'); + for (i = 0; i < WORDLENGTH; i++) + word[i] = 0; + i = 0; + } + } + return 0; +} + +int parse_eng_word(char *word, Parseoptions opt) +{ + extern int excl_w_capital; + int ends = 0, sts = 0, mids = 0; + int symbolled = 0; + int capital = 0; + int wlength = 0; + int i; + + wlength = strlen(word); + for (i = 0; word[i] != '\0'; i++) { + capital = capital||isupper(word[i]); + if (word[i] == opt.symbol) { + if (i == 0) + sts = 1; + else if (i == wlength-1) + ends = 1; + else + mids = 1; + } + if (mids&&opt.excl_word_after_symb&&!opt.excl_w_middle) { + word[i] = '\0'; + break; + } + } + if (excl_w_capital&&capital) + return WORD_NOT_INCLUDED; + symbolled = sts&&ends; + if (symbolled) + del_first_n_last(word); + if (opt.excl_symbolled&&symbolled) + return WORD_NOT_INCLUDED; + if (opt.excl_w_starting&&sts&&!symbolled) + return WORD_NOT_INCLUDED; + if (opt.excl_w_middle&&mids) + return WORD_NOT_INCLUDED; + if (opt.ecxl_w_ending&&ends&&!symbolled) + return WORD_NOT_INCLUDED; + return WORD_INCLUDED; +} + +/* + * It deletes first and last symbol in the word + */ +static void del_first_n_last(char *word) +{ + int i, k; + int wlength; + + wlength = strlen(word); + for (k = 0, i = 1; word[i] != '\''; i++, k++) + word[k] = word[i]; + word[k] = '\0'; +} + |