forked from burakbayramli/books
-
Notifications
You must be signed in to change notification settings - Fork 0
/
2009-04-07-ftrclass.diary
86 lines (77 loc) · 3.01 KB
/
2009-04-07-ftrclass.diary
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
% 2009-04-07-ftrclass.diary
% MATLAB session diary: actually try classifying on some real speech data
% (TIMIT utterances, grouped into 6 broad phonetic classes)
% [email protected] http://www.ee.columbia.edu/~dpwe/e6820/
% Read in the phoneset definition (K is handed to the lab* helpers below)
K = labreadkey('timit61.phset');
% Define 6 broad classes covering the 61 phones in the set.
% brdclass(i) is the broad-class index of phone i; the run lengths
% 13+10+7+5+2+20+4 add up to 61.  Note that the two phones immediately
% after the liquids are assigned to class 2 (fricatives).
brdclass=[ones(1,13)*1,ones(1,10)*2,ones(1,7)*3,ones(1,5)*4,[2 2],ones(1,20)*5,[6 6 6 6]];
% 1=stops 2=fricatives 3=nasals 4=liquids 5=vowels 6=silence/gaps
% Look at the labeling for one TIMIT file:
% load waveform (d = samples, sr = sampling rate)
[d,sr] = wavread('mdpk0/sa1.wav');
% load hand-label definitions (t defines times and l defines labels,
% indexed into K)
[t,l] = labreadlab('mdpk0/sa1.phn',K);
% Top panel: spectrogram of the utterance (second argument to specgram is 128)
subplot(211)
specgram(d,128,sr);
% Replace the time axis with the phone label boundaries
labplotlabs(t,l,K)
% Calculate cepstral coefficients for this example using Malcolm Slaney's
% mfcc routine.  Arguments are the waveform, its sampling rate, and the
% feature frame rate in Hz (100 frames per second here).
% Pass the sampling rate returned by wavread instead of a hard-coded 16000,
% so the features stay consistent with the other sr-based computations in
% this session; the result is identical whenever sr is 16000.
cp = mfcc(d,sr,100);
% Set up a vector of the times corresponding to each column of cp
% (100 Hz frame rate gives the 0.01 s step; with a 256 pt window, which is
% 0.016 s assuming 16 kHz audio, the first window is centred at 0.008 s)
tt=0.008:.01:(length(d)/sr-0.018);
% Now figure out the labels that go with each feature vector
% by 'sampling' the label ranges (t,l) read from file at the times in tt
ll=labsamplabs(tt,t,l);
% Using the label values in ll to index into brdclass gives us the
% broad class indices (1-6) above
% Bottom panel: broad class index per frame
subplot(212)
plot(brdclass(ll))
% x axis spans all frames; y range 0..7 leaves a margin around classes 1-6
axis([0 length(ll) 0 7])
% Lines up with segments in spectrogram.
% Use utility functions to build up training and test sets of
% features and labels, as above.
% Each appenddata call is given an utterance path stem, the data and labels
% accumulated so far, and K, and returns the updated data/label pair.
% Define a train data set (five sx* utterances from directory mdpk0)
tdat=[];
tlab=[];
[tdat,tlab]=appenddata('mdpk0/sx153',tdat,tlab,K);
[tdat,tlab]=appenddata('mdpk0/sx243',tdat,tlab,K);
[tdat,tlab]=appenddata('mdpk0/sx333',tdat,tlab,K);
[tdat,tlab]=appenddata('mdpk0/sx423',tdat,tlab,K);
[tdat,tlab]=appenddata('mdpk0/sx63',tdat,tlab,K);
% And a test set (four different utterances, sa* and si*, also from mdpk0)
% NOTE(review): train and test both come from the mdpk0 directory --
% presumably a single speaker, so this is not a speaker-independent test.
edat=[];
elab=[];
[edat,elab]=appenddata('mdpk0/sa1',edat,elab,K);
[edat,elab]=appenddata('mdpk0/sa2',edat,elab,K);
[edat,elab]=appenddata('mdpk0/si1053',edat,elab,K);
[edat,elab]=appenddata('mdpk0/si1683',edat,elab,K);
% Check the sizes of the two sets
size(edat)
ans =
1131 13
size(tdat)
ans =
1453 13
% i.e. 1131 test rows and 1453 training rows, 13 columns each
% (presumably one row per frame and one column per cepstral coefficient --
% confirm against appenddata)
% Try a classification based on the first 8 cepstra
% (columns 1:8 of the feature matrices; targets are the broad classes
% obtained by indexing brdclass with the phone labels).
% doclassif prints the frame accuracy on the train and test sets after each
% of 3 epochs, with the learning rate halving each time (0.4, 0.2, 0.1).
[H,O,R]=doclassif(tdat(:,1:8),brdclass(tlab),edat(:,1:8),brdclass(elab));
Ep 1 lr=0.4 frame accuracy on trn = 78.6648% test = 72.5022%
Ep 2 lr=0.2 frame accuracy on trn = 77.7013% test = 66.7551%
Ep 3 lr=0.1 frame accuracy on trn = 79.3531% test = 71.176%
% Do it again (same data and settings; H, O and R are overwritten)
[H,O,R]=doclassif(tdat(:,1:8),brdclass(tlab),edat(:,1:8),brdclass(elab));
Ep 1 lr=0.4 frame accuracy on trn = 80.3166% test = 71.9717%
Ep 2 lr=0.2 frame accuracy on trn = 80.4542% test = 72.1485%
Ep 3 lr=0.1 frame accuracy on trn = 81.1425% test = 70.4686%
% Several percent variation in successive runs is not unusual
% (presumably doclassif starts from a random initialization -- confirm)
% Check confusion matrix of the test labels against R from the second run
confus(brdclass(elab),R)
ans =
58 5 7 0 13 31
17 112 1 1 24 2
5 1 46 0 8 0
15 1 9 17 49 0
8 3 6 21 367 2
52 43 0 1 9 197
% The entries sum to 1131 (the number of test rows) and the diagonal sums
% to 797, i.e. 797/1131 = 70.4686%, matching the final test accuracy above.
% NOTE(review): presumably rows are the true class and columns the
% recognized class, both in class order 1-6 -- confirm against confus.
% Vowels and fricatives are distinct; silence and stops confused
% (31 and 52 in the two stop/silence off-diagonal cells).  Liquids and
% vowels are also confused (49 and 21 in the class 4/class 5 cells).