-
Notifications
You must be signed in to change notification settings - Fork 14
/
seg-PartsOfSpeech-stanford
157 lines (125 loc) · 5.62 KB
/
seg-PartsOfSpeech-stanford
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
#!/bin/bash
#
# /usr/local/bin/seg-PartsOfSpeech-stanford
#
# Calls PartsOfSpeech-StanfordNLP-02.py
#
# Written by FFS 2014-07-18
#
# Changelog
#
# 2014-08-12 Forked from seg-NER
#
#---------------------------------------------------------------------
# Configuration used throughout the script.
# Parsing script (run by name, so it must be on the PATH) and the primary tag it writes
ParseScript=PartsOfSpeech-StanfordNLP-02.py
PTAG=POS_02
# File extensions
SRC=seg   # Source file type
TMP=pos   # Temporary file type
TAR=seg   # Target file type
# Name this script was invoked under, without the directory part.  Parameter
# expansion is used instead of an unquoted `basename $0`, which returned the
# wrong name for paths containing whitespace.
SCRIPT=${0##*/}
# Help screen -- printed when the first argument is -h, --help or help, after which
# the script exits
case "$1" in
  -h|--help|help)
    # Each argument below is one output line; %b expands the \t and \n escapes
    # in the same way echo -e does.
    printf '%b\n' \
      "\n\t$SCRIPT <date or daysago> [<partially matching filename>] [clobber]\n" \
      "\tTag parts of speech in seg files using Stanford-POS servers.\n" \
      "\tProcess the .$SRC files from seven days ago:\n" \
      "\t\t$SCRIPT 7 \n" \
      "\tProcess only Aljazeera files from a given date, removing any pre-existing $PTAG codes:\n" \
      "\t\t$SCRIPT 2006-12-28 Aljazeera clobber\n" \
      "\tUse a for loop to process a series of days (see daysago 2004-06-01) -- _ matches all files:\n" \
      "\t\tsweep 2013-01-01 ; for i in {1..366} ; do $SCRIPT here ; sweep + 1 ; done\n" \
      "\tThe files are processed in ca:/sweep, the master tree, from which they will be sync'd.\n" \
      "\tCodebook:\n" \
      "\t20140731135735.813|20140731135738.049|POS_02|That/DT|'s/VBZ|never/RB|really/RB|been/VBN|possible/JJ|./.|" \
      "\tStart time|End time|Primary tag(|Word/POS category)*\n"
    exit
    ;;
esac
# Start time, in seconds since the epoch
START=$(date +%s)
# Hostname, short form
HOST=$(hostname -s)
# The POS servers are currently only on cartago -- say so and leave on any other machine
[[ "$HOST" == "cartago" ]] || { echo -e "\n\tThe POS servers are currently running only on cartago.\n" ; exit ; }
# Get the date to work on (today's date may change while the script is running).
# The first argument selects it:
#   here       -- the name of the current directory
#   <digits>   -- that many days before today
#   otherwise  -- taken as the date itself
# A missing argument leaves the day empty so that the sanity check refuses it;
# it used to pass the integer test and hand date the string "- day".
if [[ -z "$1" ]] ; then DAY=""
elif [[ "$1" == "here" ]] ; then DAY=$( pwd ) ; DAY=${DAY##*/}
elif [[ "$1" =~ ^[0-9]+$ ]] # if the first parameter is an integer
then DAY="$(date -d "-$1 day" +%F)"
else DAY="$1"
fi
# Sanity check -- an empty day is rejected explicitly, because date -d "" is not
# an error for GNU date and would let it through
if [[ -z "$DAY" || -z "$(date -d "$DAY" 2>/dev/null)" ]]
then echo -e "\n\t$SCRIPT [<date> or #] [<partially matching filename>] [clobber]\n" ; exit
fi
# Partial file name?  The pattern is the day, a wildcard, and the second argument
# -- or "_" when that argument is missing or empty
NAM="${DAY}*${2:-_}"
# Generate the date-dependent portion of the path: YYYY/YYYY-MM/YYYY-MM-DD
DDIR=$(date -d "$DAY" +%Y/%Y-%m/%F)
# Base directory
case "$HOST" in
  roma) SDIR=/tv/$DDIR ;;
  *)    SDIR=/sweep/$DDIR ;;
esac
# Define the trouble log and e-mail recipients
FAILED=/tmp/$SCRIPT.log
TO=$(cat /usr/local/bin/e-mail)
# File counter
NUM=0
# Welcome
printf '%b\n' "\n\tParts of speech tagging with $ParseScript in seg files on $DAY at $( date )\n"
# Process each matching source file in turn.
#
# For every file: repair its tags with fixEND, skip it when it already carries
# $PTAG lines (unless "clobber" was given as the third argument and the existing
# annotation is not the current one), run $ParseScript in the background while
# watching its output for stalls, and move the validated result over the target.
#
# The shell expands the pattern itself, so names are never re-split the way the
# output of ls was.  $NAM stays unquoted on purpose: it carries the wildcard.
for SEGFILE in "$SDIR"/$NAM*."$SRC" ; do
  # An unmatched pattern is handed over literally -- ignore it, leaving FIL unset
  [[ -e "$SEGFILE" ]] || continue
  # Strip path and extension
  FIL=${SEGFILE##*/} ; FIL=${FIL%.*}
  # Skip KMEX
  if [[ $FIL == *_KMEX_* ]] ; then continue ; fi
  # Remove any temporary file
  rm -f "$SDIR/$FIL.$TMP"
  # Fix TOP and END tags if needed -- also removes blank lines
  fixEND "$SDIR/$FIL.$TAR"
  # Check for existing $PTAG tags (lines that begin with the tag)
  if grep -Eq "^$PTAG" "$SDIR/$FIL.$TAR" ; then
    if [[ "$3" == "clobber" ]] ; then
      # Tweak as needed to identify the version considered up to date:
      # here, the first line naming $ParseScript must also contain $PTAG
      if grep -E -m1 -- "$ParseScript" "$SDIR/$FIL.$TAR" | grep -q -- "$PTAG"
      then echo -e "\t\t\t$ParseScript annotation $PTAG present in $FIL.$TAR -- skipping" ; continue
      else echo -en "\t\t\tRe-annotating $PTAG with $ParseScript $FIL.$SRC"
        # Drop every line that mentions the tag before annotating afresh
        sed -i "/$PTAG/d" "$SDIR/$FIL.$SRC"
      fi
    else echo -e "\t\t\t$ParseScript $PTAG completed in $FIL.$TAR" ; continue
    fi
  else echo -en "\t\t\tAnnotating $PTAG with $ParseScript $FIL.$SRC"
  fi
  # Get the size of the source file (0 when stat fails, so comparisons stay numeric)
  S0="$( stat --format=%s "$SDIR/$FIL.$SRC" 2>/dev/null )" ; S0=${S0:-0}
  # Background the annotation process to catch hangs
  $ParseScript "$SDIR/$FIL.$SRC" > "$SDIR/$FIL.$TMP" &
  # Get the PID and the start time of the annotation
  PID=$! AGE="$( date +%s )" ; n=0 ; s=0.1
  # Wait up to 200 ticks of $s seconds for the temporary file to start growing;
  # the margin shows the whole seconds waited (ten ticks to the second)
  while [[ ! -s "$SDIR/$FIL.$TMP" ]] && (( n < 200 )) ; do sleep $s
    tput cr ; echo -n "$(( n / 10 ))" ; n=$(( n + 1 ))
  done ; S1=0 S2=0 n=0 ; tput sc
  # Terminate if the file stops growing (see margin counter for files that enter this loop)
  while ps -p "$PID" > /dev/null ; do
    S1="$( stat --format=%s "$SDIR/$FIL.$TMP" 2>/dev/null )" ; sleep 1 ; NOW="$(date +%s)" ; LASTED=$(( NOW - AGE ))
    S2="$( stat --format=%s "$SDIR/$FIL.$TMP" 2>/dev/null )" ; n=$(( n + 1 )) ; tput cr ; tput ht ; echo -n "$n"
    # Same size before and after the one-second nap, and more than 30 passes made:
    # give up on this file.  A size that could not be read counts as 0.
    if (( ${S1:-0} == ${S2:-0} && n > 30 )) ; then kill "$PID" ; rm -f "$SDIR/$FIL.$TMP"
      echo -e "\n\t$(date +%F\ %H:%M) \t${SCRIPT%.*} \tNo grow \t$LASTED secs \t$FIL.$TMP" | tee -a "$FAILED.$( date +%F )"
      break   # the output is gone; another pass would only report the same stall again
    fi
  done
  # Collect the background job so that its output is complete before it is checked
  wait "$PID" 2>/dev/null
  # Elapsed time as MM:SS, or H:MM:SS from one hour up (date's %M:%S dropped the hours)
  NOW="$(date +%s)" ; LASTED=$(( NOW - AGE ))
  if (( LASTED >= 3600 ))
  then printf -v LASTED '%d:%02d:%02d' $(( LASTED / 3600 )) $(( LASTED % 3600 / 60 )) $(( LASTED % 60 ))
  else printf -v LASTED '%02d:%02d' $(( LASTED / 60 )) $(( LASTED % 60 ))
  fi
  # Check the temporary file: it must be larger than the source and its last line
  # must contain END
  S1="$( stat --format=%s "$SDIR/$FIL.$TMP" 2>/dev/null )" ; S1=${S1:-0}
  if (( S1 > S0 )) && tail -n1 "$SDIR/$FIL.$TMP" | grep -q END
  then tput rc ; tput ht ; tput ht ; echo "$LASTED"
    mv "$SDIR/$FIL.$TMP" "$SDIR/$FIL.$TAR"
  else echo -e "\n\t\t\tFailed annotation -- please check $FIL.$SRC" | tee -a "$FAILED.$( date +%F )" ; continue
  fi
  NUM=$(( NUM + 1 ))
done
# Sanity check: FIL is still empty when the file loop never picked up a file,
# so print a hint about the name pattern and stop before the receipt
[[ -n "$FIL" ]] || {
  echo -e "\tUse a partially matching file name -- leave out the date and the extension.\n"
  exit
}
# Completion time: seconds since $START, shown as MM:SS, or as H:MM:SS from one
# hour up.  (The date +%M:%S formatting used before silently dropped whole hours
# from long runs.)
NOW="$(date +%s)" ; LASTED=$(( NOW - START ))
if (( LASTED >= 3600 ))
then printf -v LASTED '%d:%02d:%02d' $(( LASTED / 3600 )) $(( LASTED % 3600 / 60 )) $(( LASTED % 60 ))
else printf -v LASTED '%02d:%02d' $(( LASTED / 60 )) $(( LASTED % 60 ))
fi
# Receipt
echo -e "\n\tCompleted annotating $NUM $SRC files to $TAR with $ParseScript in $SDIR ($LASTED)\n"
# EOF