-
Notifications
You must be signed in to change notification settings - Fork 1
/
parse.py
executable file
·117 lines (102 loc) · 3.25 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import sys
import re
def add_data(training, temp_feat):
'''
given the list of sentences, it assigns them all the same value
extracted from the product sentiment.
'''
one = temp_feat[0]
one = one.replace("}", "]")
try:
val = int(re.search('\[(.*?)\]', one).group(1))
except TypeError as tE:
print "Couldn't parse sentiment value"
sys.exit()
except IndexError as iE:
print "Index error"
print one
print re.findall('\[(.*)\]', one)
sys.exit()
except ValueError as vE:
print "Value Error"
re.match('\[(.*)\]', one)
print re.search('\[(.*)\]', one).groups()
#sys.exit()
return
except AttributeError as aE:
print "Attribute Error"
print one
print training_data
sys.exit()
for index in xrange(0, len(temp_feat)):
target = temp_feat[index]
remove = re.search("(.*##)", target)
#This will actually remove the ## part of the review as well
if not remove is None:
target = target.replace(remove.group(1), "").strip() #remove the head
training[target] = val
#print val
def val_to_polarity(training_data):
'''Given a training data, changes all vals (such as -2, -1, 0, 1, 4)
into polarity (-1, 0, 1).
'''
new_data = {}
for k, v in training_data.items():
if v == 1 or v == 0:
new_data[k] = 0
elif v > 1:
new_data[k] = 1
else:
new_data[k] = -1
return new_data
def read_txt_data(path, training_data):
'''
Given the path to the training file, it fills the training_data
dict with the appropriate sentence to sentiment pair.
'''
text_file = open(path, "r")
temp_features = []
new_sent_val = False
for line in text_file:
if "[t]" in line:
continue
if (len(re.findall(r"\[[\+|\-][0-9]\]+", line)) >= 1) and not new_sent_val: #new one
temp_features.append(line)
new_sent_val = True
continue
elif (len(re.findall(r"\[[\+|\-][0-9]\]+", line)) >= 1) and new_sent_val: #end
add_data(training_data, temp_features) #Do the work,
temp_features = []
temp_features.append(line)
new_sent_val = True
continue
if new_sent_val:
temp_features.append(line)
add_data(training_data, temp_features) #last one, corner case
def read_test_data(path):
'''
Reads the test files, returns a dict of number to sentences
'''
text_file = open(path, "r")
temp_dict = {}
keys_list = []
val_list = []
for line in text_file:
sep = line.split("\t")
#temp_dict[sep[0]] = sep[1].strip().replace("##", "")
keys_list.append(sep[0])
val_list.append(sep[1].strip().replace("##", ""))
return keys_list, val_list
#return temp_dict
def rangefy(v):
ret = None
if v == 0 or v == 1:
ret = 0
elif v > 1:
ret = 1
else:
ret = -1
return ret
#parse.read_test_data("sampleOutput/product1.txt")
#print len(training_data)
#print training_data