/* This file is a part of WordExtract project
 *
 * Copyright (C) 2009 Borisov Alexandr
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#include <ctype.h>
#include <string.h>
#include "engparser.h"
#include "word.h"

/*---------OPTIONS---------*/
int lower_first_capital = 1;
int excl_w_capital = 1;
Parseoptions hyphen = {'-', 0, 1, 0, 1, 0};
Parseoptions quote = {'\'', 0, 1, 1, 0, 0};

static void del_first_n_last(char *word);

/*
 * Receives: line terminated by "\n\0"
 *
 * Words with "-" will be processed as a whole word.
 * We also suppose that each sentence starts with a capital letter
 * and that any other word in the middle of the sentence which starts
 * with a capital letter is a name (people, towns, I, etc...).
 * Single quoted sentences are not allowed.
It's gramatically incorrect */ int parseengphrase(char *phrase) { extern Parseoptions hyphen; extern Parseoptions quote; char word[WORDLENGTH] = {0}; int sentence_start = 1; int i, k; for (i = 0, k = 0; phrase[k] != '\0'; k++) { if ((isalpha(phrase[k]))||(phrase[k] == '\'')||(phrase[k] == '-')) { word[i++] = phrase[k]; if ((lower_first_capital)&&(sentence_start)&&(phrase[k] != '\'')&&(phrase[k] != '-')) { word[i-1] = tolower(word[i-1]); sentence_start = 0; } } else { if ((*word != '\0')&&!parse_eng_word(word, quote)&&!parse_eng_word(word, hyphen)) to_list(word); sentence_start = sentence_start||(phrase[k] == '.')||(phrase[k] == '!')||(phrase[k] == '?'); for (i = 0; i < WORDLENGTH; i++) word[i] = 0; i = 0; } } return 0; } int parse_eng_word(char *word, Parseoptions opt) { extern int excl_w_capital; int ends = 0, sts = 0, mids = 0; int symbolled = 0; int capital = 0; int wlength = 0; int i; wlength = strlen(word); for (i = 0; word[i] != '\0'; i++) { capital = capital||isupper(word[i]); if (word[i] == opt.symbol) { if (i == 0) sts = 1; else if (i == wlength-1) ends = 1; else mids = 1; } if (mids&&opt.excl_word_after_symb&&!opt.excl_w_middle) { word[i] = '\0'; break; } } if (excl_w_capital&&capital) return WORD_NOT_INCLUDED; symbolled = sts&&ends; if (symbolled) del_first_n_last(word); if (opt.excl_symbolled&&symbolled) return WORD_NOT_INCLUDED; if (opt.excl_w_starting&&sts&&!symbolled) return WORD_NOT_INCLUDED; if (opt.excl_w_middle&&mids) return WORD_NOT_INCLUDED; if (opt.ecxl_w_ending&&ends&&!symbolled) return WORD_NOT_INCLUDED; return WORD_INCLUDED; } /* * It deletes first and last symbol in the word */ static void del_first_n_last(char *word) { int i, k; int wlength; wlength = strlen(word); for (k = 0, i = 1; word[i] != '\''; i++, k++) word[k] = word[i]; word[k] = '\0'; }