summaryrefslogtreecommitdiff
path: root/engparser.c
diff options
context:
space:
mode:
authorb0ric <b0risov.alexandr@rambler.ru>2009-08-03 00:50:48 +0300
committerb0ric <b0risov.alexandr@rambler.ru>2009-08-03 00:50:48 +0300
commit6bdf8387c130d797a16e85b402fe1ead6fdfac4e (patch)
tree4efa342ffaff0c93bee13648017893e5438184b9 /engparser.c
parente8349635329344f2c152d68c5dcaf65a7d385f9c (diff)
Basic console version of WordExtract project
Diffstat (limited to 'engparser.c')
-rw-r--r--engparser.c126
1 files changed, 126 insertions, 0 deletions
diff --git a/engparser.c b/engparser.c
new file mode 100644
index 0000000..2c3c77f
--- /dev/null
+++ b/engparser.c
@@ -0,0 +1,126 @@
+/* This file is a part of WordExtract project
+ *
+ * Copyright (C) 2009 Borisov Alexandr
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "engparser.h"
+#include "word.h"
+
+/*---------OPTIONS---------*/
+int lower_first_capital = 1; /* non-zero: parseengphrase() lowercases the first letter of a sentence */
+int excl_w_capital = 1; /* non-zero: parse_eng_word() rejects words containing an uppercase letter */
+
+Parseoptions hyphen = {'-', 0, 1, 0, 1, 0}; /* rules for '-'; NOTE(review): field order is set in the header - confirm */
+Parseoptions quote = {'\'', 0, 1, 1, 0, 0}; /* rules for the apostrophe; same positional layout as above */
+
+static void del_first_n_last(char *word);
+
+/*
+ * Splits phrase (a NUL-terminated line, normally ending in "\n") into words
+ * and hands each accepted word to to_list().  A word is a run of letters,
+ * '\'' and '-'; every other character, and the final NUL, ends it.  A word is
+ * accepted when parse_eng_word() returns 0 for both the quote and the hyphen
+ * rules (presumably WORD_INCLUDED is 0 - confirm in engparser.h).
+ * When lower_first_capital is set, the first letter of the line and the first
+ * letter after '.', '!' or '?' is lowercased.  Letters that do not fit into
+ * WORDLENGTH - 1 characters are dropped.  Always returns 0.
+ */
+int parseengphrase(char *phrase)
+{
+    char word[WORDLENGTH] = {0};
+    int sentence_start = 1;
+    int i = 0, k;
+
+    for (k = 0; ; k++) {
+        unsigned char c = (unsigned char) phrase[k]; /* safe for <ctype.h> */
+        if (isalpha(c) || c == '\'' || c == '-') {
+            if (lower_first_capital && sentence_start && isalpha(c)) {
+                c = (unsigned char) tolower(c);
+                sentence_start = 0;
+            }
+            if (i < WORDLENGTH - 1) /* keep room for the terminating NUL */
+                word[i++] = (char) c;
+            continue;
+        }
+        /* Separator or final NUL: hand over the pending word, if any. */
+        if (i > 0 && !parse_eng_word(word, quote) && !parse_eng_word(word, hyphen))
+            to_list(word);
+        sentence_start = sentence_start || c == '.' || c == '!' || c == '?';
+        memset(word, 0, sizeof word);
+        i = 0;
+        if (c == '\0')
+            return 0;
+    }
+}
+
+/*
+ * Checks word against the rules in opt for the character opt.symbol and may
+ * edit word in place.  Returns WORD_INCLUDED or WORD_NOT_INCLUDED.
+ * If the symbol occurs inside the word while opt.excl_word_after_symb is set
+ * and opt.excl_w_middle is not, the word is cut off at that symbol.  A word
+ * that both starts and ends with the symbol has that pair stripped.  A word
+ * containing an uppercase letter is rejected when excl_w_capital is set.
+ */
+int parse_eng_word(char *word, Parseoptions opt)
+{
+    size_t i, wlength = strlen(word);
+    int sts = 0, mids = 0, ends = 0; /* symbol at start / inside / at end */
+    int capital = 0, symbolled;
+
+    for (i = 0; i < wlength; i++) {
+        capital = capital || isupper((unsigned char) word[i]); /* no negative char */
+        if (word[i] != opt.symbol)
+            continue;
+        if (i == 0) {
+            sts = 1;
+        } else if (i == wlength - 1) {
+            ends = 1;
+        } else if (opt.excl_word_after_symb && !opt.excl_w_middle) {
+            word[i] = '\0'; /* keep only the part before the symbol */
+            break;
+        } else {
+            mids = 1;
+        }
+    }
+    if (excl_w_capital && capital)
+        return WORD_NOT_INCLUDED;
+    symbolled = sts && ends;
+    if (symbolled)
+        del_first_n_last(word);
+    if ((opt.excl_symbolled && symbolled) || (opt.excl_w_middle && mids))
+        return WORD_NOT_INCLUDED;
+    if (!symbolled && ((opt.excl_w_starting && sts) || (opt.ecxl_w_ending && ends)))
+        return WORD_NOT_INCLUDED;
+    return WORD_INCLUDED;
+}
+
+/*
+ * Strips the first and the last character of word in place (used to
+ * unwrap a word enclosed in a pair of symbols, e.g. 'word' -> word).
+ * A word shorter than two characters becomes the empty string.
+ */
+static void del_first_n_last(char *word)
+{
+    size_t wlength = strlen(word);
+    size_t inner = (wlength >= 2) ? wlength - 2 : 0;
+
+    memmove(word, word + 1, inner); /* regions overlap, so not memcpy */
+    word[inner] = '\0';
+}
+