Basic console version of WordExtract project

author: b0ric <b0risov.alexandr@rambler.ru> 2009-08-03 00:50:48 +0300
committer: b0ric <b0risov.alexandr@rambler.ru> 2009-08-03 00:50:48 +0300
commit: 6bdf8387c130d797a16e85b402fe1ead6fdfac4e (patch)
tree: 4efa342ffaff0c93bee13648017893e5438184b9 /engparser.c
parent: e8349635329344f2c152d68c5dcaf65a7d385f9c (diff)
1 files changed, 126 insertions, 0 deletions
diff --git a/engparser.c b/engparser.c
new file mode 100644
index 0000000..2c3c77f
--- /dev/null
+++ b/engparser.c
@@ -0,0 +1,126 @@
+/* This file is a part of WordExtract project
+ *
+ * Copyright (C) 2009 Borisov Alexandr
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <ctype.h>
+#include <string.h>
+#include "engparser.h"
+#include "word.h"
+
+/*---------OPTIONS---------*/
+int lower_first_capital = 1;	/* nonzero: the first letter of each sentence is lowercased */
+int excl_w_capital = 1;	/* nonzero: words containing a capital letter are rejected */
+
+Parseoptions hyphen = {'-', 0, 1, 0, 1, 0};	/* rules for '-'; field order follows the Parseoptions declaration (not in this file) */
+Parseoptions quote = {'\'', 0, 1, 1, 0, 0};	/* rules for '\''; same field order */
+
+static void del_first_n_last(char *word);	/* defined at the end of this file */
+
+/*
+ * Splits a line into words and hands every accepted word to to_list().
+ * Receives: line terminated by "\n\0"
+ * Words with "-" will be processed as a whole word.
+ *	We also suppose that each sentence starts with a capital letter and that
+ *	any other word starting with a capital letter is a name (people, towns, I, etc...).
+ * Single quoted sentences are not allowed. It's grammatically incorrect.
+ * Symbols that do not fit into WORDLENGTH-1 are dropped. Always returns 0.
+ */
+int parseengphrase(char *phrase)
+{
+ extern Parseoptions hyphen, quote;
+ char word[WORDLENGTH] = {0};
+ int sentence_start = 1;
+ int i, k;
+
+ for (i = 0, k = 0; phrase[k] != '\0'; k++) {
+	 /* <ctype.h> takes unsigned char values; plain char may be negative */
+	 unsigned char c = (unsigned char)phrase[k];
+	 if (isalpha(c) || c == '\'' || c == '-') {
+		 if (i >= WORDLENGTH-1)
+			continue;	/* buffer full: keep the terminating '\0' */
+		 word[i++] = phrase[k];
+		 if (lower_first_capital && sentence_start && c != '\'' && c != '-') {
+			 word[i-1] = tolower(c);
+			 sentence_start = 0;
+		 }
+	 } else {	/* any other character ends the current word */
+		 if ((*word != '\0') && !parse_eng_word(word, quote) && !parse_eng_word(word, hyphen))
+			to_list(word);
+		 sentence_start = sentence_start || c == '.' || c == '!' || c == '?';
+		 memset(word, 0, sizeof word);
+		 i = 0;
+	 }
+ }
+ return 0;
+}
+
+/*
+ * Checks a word against the rules in opt; may cut or strip it in place.
+ * Returns: WORD_NOT_INCLUDED if a rule (or excl_w_capital) rejects the
+ *	word, WORD_INCLUDED otherwise.
+ */
+int parse_eng_word(char *word, Parseoptions opt)
+{
+ extern int excl_w_capital;
+ int ends = 0, sts = 0, mids = 0;	/* opt.symbol seen at end/start/middle */
+ int symbolled, capital = 0, i;
+ int wlength = strlen(word);
+
+ for (i = 0; word[i] != '\0'; i++) {
+	 capital = capital || isupper((unsigned char)word[i]);	/* ctype needs unsigned char */
+	 if (word[i] == opt.symbol) {
+		if (i == 0)
+			sts = 1;
+		else if (i == wlength-1)
+			ends = 1;
+		else
+			mids = 1;
+	 }
+	 if (mids && opt.excl_word_after_symb && !opt.excl_w_middle) {
+		word[i] = '\0';	/* cut the word at the symbol */
+		break;
+	 }
+ }
+ if (excl_w_capital && capital)
+	return WORD_NOT_INCLUDED;
+ symbolled = sts && ends;	/* word is enclosed in opt.symbol */
+ if (symbolled)
+	del_first_n_last(word);
+ if (opt.excl_symbolled && symbolled)
+	return WORD_NOT_INCLUDED;
+ if (!symbolled && ((opt.excl_w_starting && sts) || (opt.ecxl_w_ending && ends)))
+	return WORD_NOT_INCLUDED;
+ if (opt.excl_w_middle && mids)
+	return WORD_NOT_INCLUDED;
+ return WORD_INCLUDED;
+}
+
+/*
+ * It deletes first and last symbol in the word, whatever they are.
+ * Words shorter than two symbols become empty.
+ */
+static void del_first_n_last(char *word)
+{
+ size_t wlength = strlen(word);
+ size_t inner = (wlength < 2) ? 0 : wlength-2;
+
+ /* the length, not a hard-coded '\'', marks the end: words may be in "-" too */
+ memmove(word, word+1, inner);	/* regions overlap, so not memcpy */
+ word[inner] = '\0';
+}
+
author	b0ric <b0risov.alexandr@rambler.ru>	2009-08-03 00:50:48 +0300
committer	b0ric <b0risov.alexandr@rambler.ru>	2009-08-03 00:50:48 +0300
commit	6bdf8387c130d797a16e85b402fe1ead6fdfac4e (patch)
tree	4efa342ffaff0c93bee13648017893e5438184b9 /engparser.c
parent	e8349635329344f2c152d68c5dcaf65a7d385f9c (diff)