-
Notifications
You must be signed in to change notification settings - Fork 16
Expand file tree
/
Copy pathWordFinder.py
More file actions
137 lines (115 loc) · 4.88 KB
/
WordFinder.py
File metadata and controls
137 lines (115 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#TextMining Project: KeyWord Search
#This script searches a given paper or set of papers for occurrences of several keywords.
#It then tells the user the most relevant paper, based on the maximum number of occurrences of a keyword or keywords.
#Claire Kincaid
#April 18, 2016 (revised for MiniProject 5)
import doctest
import string
import random
import math
from random import randint
import requests
# Download the three source texts from gutenberg.org. These requests run at
# import time, so the module needs network access to load.
# A timeout keeps a stalled connection from hanging the script indefinitely.
_FETCH_TIMEOUT = 30  # seconds
Knight_full_text = requests.get('http://www.gutenberg.org/files/55708/55708-8.txt', timeout=_FETCH_TIMEOUT).text
America_full_text = requests.get('http://www.gutenberg.org/files/55713/55713-0.txt', timeout=_FETCH_TIMEOUT).text
Wolf_full_text = requests.get('http://www.gutenberg.org/files/55720/55720-8.txt', timeout=_FETCH_TIMEOUT).text

# Remove every ASCII punctuation character from each text.
exclude = set(string.punctuation)
s1 = ''.join(ch for ch in Knight_full_text if ch not in exclude)
s2 = ''.join(ch for ch in America_full_text if ch not in exclude)
s3 = ''.join(ch for ch in Wolf_full_text if ch not in exclude)

# Lowercase everything so that "The" and "the" are indexed as the same word.
Knight_text = s1.lower()
America_text = s2.lower()
Wolf_text = s3.lower()

# Join with a space so the last word of one text is never fused with the
# first word of the next.
whole_text = ' '.join((Knight_text, America_text, Wolf_text))

# split() with no argument splits on any run of whitespace. The previous
# split(' ') left line breaks embedded inside tokens and produced
# empty-string "words" wherever two spaces were adjacent.
word_list = whole_text.split()
# Build the follower index used by quote(): every word in word_list (except
# the final one) becomes a key, and its value is the list of words that
# immediately follow it somewhere in word_list. Duplicates are kept, so a
# follower that occurs more often is proportionally more likely to be drawn
# by random.choice later.
# The final word of word_list has no follower, so it is only a key if it
# also occurs earlier in the list.
new_dict = {}
for index, word in enumerate(word_list[:-1]):
    new_dict.setdefault(word, []).append(word_list[index + 1])
def quote(data, length_quote):
    """Generate a random pseudo-sentence by walking a follower index.

    A starting key is picked at random from ``data``. At each step one of the
    current key's followers is picked at random, appended to the output, and
    becomes the next key. The starting key itself is not part of the output.

    Args:
        data: dict mapping a word to the list of words that may follow it.
        length_quote: number of words to generate.

    Returns:
        The generated words joined by single spaces, followed by the literal
        suffix '."'.

    Raises:
        IndexError: if ``data`` is empty (there is no key to start from).
    """
    start_keys = list(data.keys())
    x = random.choice(start_keys)
    words = []
    while len(words) < length_quote:
        # A word can appear only as a follower (e.g. the last word of the
        # corpus) and so have no entry of its own. Restart from a random key
        # instead of failing with KeyError.
        if x not in data:
            x = random.choice(start_keys)
        next_word = random.choice(data[x])
        words.append(next_word)
        x = next_word
    # Collect and join once rather than growing a string inside the loop.
    return ' '.join(words) + '."'
# Generate a 10000-word random passage from the follower index and print it.
# The passage is reused below as the text that is searched for keywords.
Knight_America_Wolf = quote(new_dict, 10000)
print(Knight_America_Wolf)
def make_data(data):
    """Turn a string into a list of lowercase, punctuation-free words.

    All ASCII punctuation characters are removed, the text is lowercased, and
    the result is split on any run of whitespace.

    Args:
        data: the text to tokenize.

    Returns:
        A list of words; an empty list if ``data`` contains no words.

    >>> make_data("I'm hilarious")
    ['im', 'hilarious']
    """
    # The previous data.split(data) used the whole text as its own separator
    # and therefore always returned ['', ''].
    without_punctuation = data.translate(str.maketrans('', '', string.punctuation))
    return without_punctuation.lower().split()
def word_count(data):
    """Count how often each token of a string occurs.

    Tokenization is delegated to make_data(); every token it returns becomes
    a key of the result, mapped to the number of times it was returned.

    Args:
        data: the text to analyze.

    Returns:
        A dict mapping token -> occurrence count, in order of first
        appearance.

    >>> word_count("I'm hilarious")
    {'im': 1, 'hilarious': 1}
    """
    tally = {}
    for token in make_data(data):
        if token in tally:
            tally[token] += 1
        else:
            tally[token] = 1
    return tally
def word_find(data, keyword):
    """Return how many times ``keyword`` occurs in a string.

    The string is counted with word_count(), and the entry for ``keyword`` is
    looked up in the resulting dict.

    Args:
        data: the text to search.
        keyword: the word whose frequency is wanted.

    Returns:
        The count recorded for ``keyword``, or 0 if it is absent.

    >>> word_find("I'm hilarious", "hilarious")
    1
    """
    counts = word_count(data)
    if keyword in counts:
        return counts[keyword]
    return 0
def multi_keywords_find(data, keywords):
    """Count the occurrences of several keywords in one string.

    Args:
        data: the text to search.
        keywords: iterable of words to look up.

    Returns:
        A dict mapping each keyword to its count in ``data`` (0 if absent),
        in the order the keywords were given.

    >>> multi_keywords_find("I'm hilarious", ['im', 'hilarious'])
    {'im': 1, 'hilarious': 1}
    """
    # Build the histogram once and look every keyword up in it, instead of
    # re-tokenizing and re-counting the whole text for each keyword.
    hist = word_count(data)
    return {keyword: hist.get(keyword, 0) for keyword in keywords}
def multi_paper_word_find(data, keyword):
    """Count the occurrences of ``keyword`` in each of several papers.

    Args:
        data: an iterable of paper texts (strings). A single string is
            treated as one paper rather than being iterated character by
            character.
        keyword: the word to count in every paper.

    Returns:
        A dict mapping each paper's text to the keyword's count in it,
        ordered from the highest count to the lowest. Papers with equal
        counts keep their input order. Identical texts share one entry.
    """
    papers = [data] if isinstance(data, str) else data
    data_keyword = {paper: word_find(paper, keyword) for paper in papers}
    # dicts keep insertion order, so inserting in sorted order yields the
    # highest-to-lowest ordering this function is documented to return.
    ranked = sorted(data_keyword.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
def relevance(data, keyword):
    """Return the paper in which ``keyword`` occurs most often.

    Args:
        data: an iterable of paper texts (strings). A single string is
            treated as one paper.
        keyword: the word whose frequency decides relevance.

    Returns:
        The text of the paper with the highest count of ``keyword``; on a
        tie, the first such paper in the order multi_paper_word_find()
        returned them. None if there are no papers.
    """
    papers = [data] if isinstance(data, str) else data
    data_keyword = multi_paper_word_find(papers, keyword)
    if not data_keyword:
        return None
    # The previous max(data_keyword.get(data, 0)) applied max() to a single
    # int and raised TypeError. Pick the key with the largest count instead.
    most_relevant = max(data_keyword, key=data_keyword.get)
    return most_relevant
# Keywords to look for in the generated passage.
keywords = ['kingdom', 'wild', 'cold', 'learning', 'war', 'violence', 'journey', 'love','hero','freedom','morning','people','interested','the']
# Print a dict mapping every keyword to its count in the generated passage.
print (multi_keywords_find(Knight_America_Wolf, keywords))
# Print the relevance result for the single keyword keywords[1] ('wild').
# NOTE(review): multi_paper_word_find's docstring describes its data argument
# as several datasets in a list, but a single string is passed here - confirm
# the intended input.
print (relevance(Knight_America_Wolf, keywords[1]))