#!/usr/bin/python -W ignore
#
# This script reads a .seg file and parses each caption line for sentiment (polarity and subjectivity).
#
# First, it scores each sentence using the native sentiment detector in Pattern2.6
# https://github.com/clips/pattern and http://www.clips.ua.ac.be/pages/pattern-en#sentiment
#
# 20140710235636.492|20140710235648.904|SMT_01|0.1|0.45|different|0.0|0.6|very|0.2|0.3
# Start time|End time|Primary tag|Sentence polarity|Sentence subjectivity(|Word|Word polarity|Word subjectivity)*
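#
# For reference, the Pattern 2.6 call behind SMT_01, sketched interactively
# (the scores shown are illustrative, not from a real run):
#
#   >>> from pattern.en import sentiment
#   >>> sentiment("That was a very different movie.")
#   (0.2, 0.45)
#   >>> sentiment("That was a very different movie.").assessments
#   [(['very', 'different'], 0.2, 0.45, None)]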
#
# Second, it scores the words present in the SentiWordNet dictionary.
# http://sentiwordnet.isti.cnr.it/ and http://sentiwordnet.isti.cnr.it/docs/SWNFeedback.pdf
# cartago:/usr/share/pyshared/pattern/text/en/wordnet/SentiWordNet_3.0.0_20130122.txt
#
# 20140720230042.157|20140720230050.866|SMT_02|HOLLYWOOD|0.0|0.0|MOURNING|-0.625|0.625|MAVERICK|0.375|0.375|OVER|0.375
# Start time|End time|Primary tag(|Word|Word polarity|Word subjectivity)*
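#
# The per-word lookup behind SMT_02, sketched interactively (the weights are
# illustrative; the tuple is (polarity, subjectivity) from SentiWordNet):
#
#   >>> from pattern.en import wordnet, ADJECTIVE
#   >>> wordnet.synsets("happy", ADJECTIVE)[0].weight
#   (0.375, 0.875)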
#
# Third, it scores the sentence using TextBlob 0.9-dev PatternAnalyzer sentiment detection, which may have a better lexicon
# http://textblob.readthedocs.org/en/dev/advanced_usage.html#advanced -- we seem to get the same scores as SMT_01
#
# 20140720230042.157|20140720230050.866|SMT_03|0.5|0.5
# Start time|End time|Primary tag|Sentence polarity|Sentence subjectivity
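#
# The TextBlob call behind SMT_03, sketched (scores illustrative):
#
#   >>> from textblob import TextBlob
#   >>> TextBlob("A very different movie.").sentiment
#   Sentiment(polarity=0.13, subjectivity=0.6)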
#
# Fourth, it scores the sentence using TextBlob 0.9-dev NaiveBayesAnalyzer, an NLTK classifier trained on a movie reviews corpus
# http://textblob.readthedocs.org/en/dev/advanced_usage.html#advanced -- these scores are very different
#
# 20140720230042.157|20140720230050.866|SMT_04|0.927279087353
# Start time|End time|Primary tag|Sentence positivity
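#
# The TextBlob call behind SMT_04, sketched (probabilities illustrative; the
# analyzer trains on the NLTK movie reviews corpus the first time it runs, so
# expect a delay):
#
#   >>> from textblob import TextBlob
#   >>> from textblob.sentiments import NaiveBayesAnalyzer
#   >>> TextBlob("A wonderful film.", analyzer=NaiveBayesAnalyzer()).sentiment
#   Sentiment(classification='pos', p_pos=0.93, p_neg=0.07)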
#
# Direct the output to overwrite the input .seg file (using sponge) or to a new .smt file
# for FIL in `ls -1 /db/tv/2014/2014-07/2014-07-11/*seg` ; do python Sentiment-annotate.py $FIL > ${FIL%.*}.smt ; done
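#
# In-place variant with sponge from moreutils (illustrative):
# for FIL in `ls -1 /db/tv/2014/2014-07/2014-07-11/*seg` ; do python Sentiment-annotate.py $FIL | sponge $FIL ; done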
#
# Not all of these need to be activated at once -- in July-August 2014 we ran the first and second.
#
# Can be called by cc-integrate-rongda-segmentation and seg-Sentiment
#
# Written by FFS, 2014-07-18
#
# Changelog:
#
# 2014-08-13 Set #!/usr/bin/python -W ignore to turn off Unicode warnings
# 2014-08-02 Added sentiment detection SMT_03 and 04 from TextBlob
# 2014-08-02 Forked from CLiPS-03.py for a pure sentiment script
# 2014-07-27 Learned enough python to control the format, logic improved
# 2014-05-18 First version SentiWordNet.py, poor sentiment output format
#
# --------------------------------------------------------------------------------------------------
# User input
import sys, os.path
scriptname = os.path.basename(sys.argv[0])
# Help screen -- also shown when no file argument is given
if len(sys.argv) < 2 or sys.argv[1] == "-h":
    print "".join([ "\n","\t","This is a production script for sentiment detection -- issue:","\n" ])
    print "".join([ "\t","\t","python ",scriptname," $FIL.seg > $FIL.smt" ])
    print "".join([ "\n","\t","or use the seg-Sentiment bash script for bulk processing.","\n" ])
    quit()
filename = sys.argv[1]
# Libraries
import nltk, datetime, re
# For Pattern2.6 native sentiment detection
# http://www.clips.ua.ac.be/pages/pattern-en#sentiment
from pattern.en import sentiment
# For the SentiWordNet dictionary
# http://www.clips.ua.ac.be/pages/pattern-en#wordnet
from pattern.en import wordnet
from pattern.en import ADJECTIVE
# For TextBlob PatternAnalyzer sentiment detection
# http://textblob.readthedocs.org/en/dev/advanced_usage.html#advanced
from textblob import TextBlob
# For TextBlob NLTK sentiment detection
# http://textblob.readthedocs.org/en/dev/advanced_usage.html#advanced
from textblob.sentiments import NaiveBayesAnalyzer
# Counter
n = 0
# A. Get the lines from the file
with open(filename) as fp:
    for line in fp:
        # B. Split each line into fields
        field = line.split("|")
        # Pretty debug
        # print('\n'.join('{}: {}'.format(*k) for k in enumerate(field)))
        # C. Header and footer -- caption lines start with an 18-character timestamp
        if len(field[0]) != 18:
            print line,
            continue
        # D. Program credit
        if n == 0:
            credit=["SMT_01|",datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),"|Source_Program=Pattern 2.6, ",scriptname,"|Source_Person=Tom De Smedt, FFS|Codebook=polarity, subjectivity"]
            print "".join(credit)
            credit=["SMT_02|",datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),"|Source_Program=SentiWordNet 3.0, ",scriptname,"|Source_Person=Andrea Esuli, FFS|Codebook=polarity, subjectivity"]
            print "".join(credit)
            # credit=["SMT_03|",datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),"|Source_Program=TextBlob 0.9-dev, ",scriptname,"|Source_Person=Steven Loria, FFS|Codebook=polarity, subjectivity"]
            # print "".join(credit)
            # credit=["SMT_04|",datetime.datetime.now().strftime("%Y-%m-%d %H:%M"),"|Source_Program=TextBlob 0.9-dev, NLTK, ",scriptname,"|Source_Person=Steven Loria, FFS|Codebook=positivity"]
            # print "".join(credit)
            n=1
        # E. Write segment tags and other non-caption lines
        if field[2] == "SEG":
            print line,
            continue
        elif len(field[2]) != 3:
            print line,
            continue
        # F. Get the text, clean leading chevrons -- if BOM, strip non-ascii, otherwise remove individually
        try:
            text = re.sub('^[>,\ ]{0,6}','', field[3])
            if re.search("(\xef\xbf\xbd)", text): text = ''.join([x for x in text if ord(x) < 128])
            text = str(text).replace('\x00 ','').replace('\xef\xbf\xbd','')
            text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','')
            text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','')
            text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','')
            text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','')
            # print text
        except IndexError:
            print line,
            continue
        # G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works)
        line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','')
        print line,
        snt = ""
        smt = ""
        terms = ""
        smt3 = ""
        smt4 = ""
        # Note: you could split the text into sentences, but it breaks the adjacency of text and tags
        # http://www.clips.ua.ac.be/pages/pattern-en
        # tokenize(string, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", replace={})
        # from pattern.en import tokenize
        # for sentence in tokenize(text):
        # H. Pattern2.6 built-in sentiment detection by sentence and words
        snt = sentiment(text)
        for tup in snt.assessments:
            words = " ".join(tup[0])
            terms = "".join([terms,"|",words,"|",str(tup[1]),"|",str(tup[2])])
        if snt != "": print "".join([field[0],"|",field[1],"|SMT_01|",str(snt[0]),"|",str(snt[1]),terms])
        # I. Word loop for the SentiWordNet dictionary
        try:
            for word in nltk.word_tokenize(text):
                # Skip words without an adjective synset instead of abandoning the line
                try:
                    weight = wordnet.synsets(word, ADJECTIVE)[0].weight
                except IndexError:
                    continue
                smt = "".join([smt,"|",word,"|",str(weight[0]),"|",str(weight[1])])
        except (UnicodeDecodeError, UnicodeEncodeError, AssertionError): pass
        if smt != "": print "".join([field[0],"|",field[1],"|SMT_02",smt])
        # The TextBlob detectors below are deactivated -- delete this continue to run SMT_03 and SMT_04
        continue
        # J. TextBlob default PatternAnalyzer sentiment detection
        try:
            smt3 = TextBlob(text)
            # Sentiment(polarity=0.13636363636363635, subjectivity=0.5)
        except UnicodeDecodeError: continue
        if smt3 != "": print "".join([field[0],"|",field[1],"|SMT_03|",str(smt3.sentiment[0]),"|",str(smt3.sentiment[1])])
        # K. TextBlob NLTK sentiment detection
        try:
            smt4 = TextBlob(text, analyzer=NaiveBayesAnalyzer())
            # Sentiment(classification='pos', p_pos=0.9272790873528883, p_neg=0.07272091264711199)
        except UnicodeDecodeError: continue
        if smt4 != "": print "".join([field[0],"|",field[1],"|SMT_04|",str(smt4.sentiment[1])])
# L. The with statement closes the file automatically
# EOF