-
Notifications
You must be signed in to change notification settings - Fork 4
/
window_fn.ml
539 lines (493 loc) · 19.4 KB
/
window_fn.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
(* Flexible window processing, mainly for the sake of
digital signal processing (filtering, etc.)
Window processing is uniform in the backend: that's why it is a functor
*)
(* The identity function that makes CPS convenient *)
let (let-) c k = c k
module type cde_ex = module type of Cde_ex
module type stream_raw = module type of Stream_raw
module Make(C: cde_ex)(Raw: stream_raw with
type 'a exp = 'a C.exp and
type 'a stm = 'a C.stm and
type 'a mut = 'a C.mut and
type 'a arr = 'a C.arr and
type 'a tbase = 'a C.tbase) = struct
(* Open C only locally; otherwise its && etc. operators interfere
Local opens also makes the code generation places explicit
*)
type 'a exp = 'a C.exp
type 'a stm = 'a C.stm
type 'a mut = 'a C.mut
type 'a arr = 'a C.arr
type 'a tbase = 'a C.tbase
type 'a stream = 'a Raw.stream
type 'a cstream = 'a exp stream
open Raw
(* Note on performance
A key to an easily vectorizable code is to generate loops such that:
- boundaries are static or at least simple
- indexing is simple: a[i] is simple, a[i+1] may be too,
but a[c-i] is not
As another constraint, since the current element is available immediately,
it is better to write the reduction as
x[i] `op` x[i-1] ... `op` [x-m+1]
(The operation op may not have a unit),
so that the current element x[i] can be used to initialize the accumulator.
Therefore, to make sure the history buffer is accessed as just buffer[i]
where i is the loop index (in the order of the loop index),
the buffer should be filled *backwards*: the most recent element should have
the smallest index.
*)
module type window = sig
type e (* element type: base type *)
type t (* window type *)
val etyp : e tbase
val size : int (* statically known, >0 *)
val make_stream : ?padding:bool -> e exp stream -> t stream
(* General weghted reduction of the window: essentially map-reduce
reduce_gen op extr
The operation extr i e is mapping: maps the i-th element e of the window
to a value domain a, and op is an associative operation on that domain.
*)
val reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm
end
(* introduce two convenient specializations of reduce_gen *)
module type window_ex = sig
include window
(* reduce the window with an associative operation
When used as an argument of map_raw, effectively computes
signal[i] `op` signal[i-1] ... `op` signal[i-size+1]
*)
val reduce : (e exp -> e exp -> e exp) -> t ->
(e exp -> 'w stm) -> 'w stm
(*
When used as an argument of map_raw, effectively computes the
convolution
weight[0] * signal[i] +
weight[1] * signal[i-1] + ...
weight[size-1] * signal[i-size+1]
where + stands for the reduction op.
*)
(* array of weights, length = size *)
val dot : 'a tbase -> ('b -> 'a exp) -> 'b array ->
(e exp -> e exp -> e exp) -> (* reduction op, associative *)
('a exp -> e exp -> e exp) -> (* weight `multiplication' *)
t -> (e exp -> 'w stm) -> 'w stm
end
(* The extended operations can often be computed generically *)
module WinEx(W:window) = struct
include W
let reduce : (e exp -> e exp -> e exp) -> t ->
(e exp -> 'w stm) -> 'w stm = fun op t k ->
reduce_gen op (fun _ x -> x) t k
let dot : 'a tbase -> ('b -> 'a exp) -> 'b array ->
(e exp -> e exp -> e exp) -> (* reduction op, associative *)
('a exp -> e exp -> e exp) -> (* weight `multiplication' *)
t -> (e exp -> 'w stm) -> 'w stm =
fun typ lift weights op mul t k ->
assert (Array.length weights = size);
let open C in
new_static_array typ lift weights @@ fun wt ->
reduce_gen op (fun i x -> mul (array_get' wt i) x) t k
end
type 'a window = (module window_ex with type e = 'a)
(* makers of windows in various particular cases *)
(* The simplest window, of size 2. If slide = true, it slides by one,
otherwise, tumbles
*)
module Size2(S: sig type e val etyp: e tbase val slide: bool end) = struct
type e = S.e
let etyp = S.etyp
type t = {prev: e mut; curr: e exp}
let size = 2
let make_stream : ?padding:bool -> e exp stream -> t stream =
fun ?(padding=false) st ->
if padding && S.slide then
(* If the window is sliding and initially padded by 0,
then it is always fully filled and we don't need inited.
*)
let- prev = Raw.initializing_ref (C.tbase_zero etyp) in
st |>
Raw.map_raw C.(fun e k -> k {prev; curr=e} @. (prev := e))
else (* general case *)
let- inited = Raw.initializing_ref (C.bool padding) in
let- prev = Raw.initializing_ref (C.tbase_zero etyp) in
st |>
Raw.map_raw ~linear:false C.(fun e k ->
if_ (dref inited) (
k {prev; curr=e} @.
if S.slide then
prev := e
else
inited := bool false
) (
(prev := e) @.
(inited := bool true)
)
)
let reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm =
fun op mapper {prev;curr} ->
op (mapper C.(int 0) curr) (mapper C.(int 1) (C.dref prev)) |> C.letl
let reduce : (e exp -> e exp -> e exp) -> t ->
('a exp -> 'w stm) -> 'w stm =
fun op t -> reduce_gen op (fun _ x -> x) t
let dot : 'a tbase -> ('b -> 'a exp) -> 'b array ->
(e exp -> e exp -> e exp) -> ('a exp -> e exp -> e exp) ->
t -> (e exp -> 'w stm) -> 'w stm =
fun _typ lift weights op mul {prev;curr} ->
assert (Array.length weights = 2);
C.(op (mul (lift weights.(0)) curr)
(mul (lift weights.(1)) (dref prev))) |> C.letl
end
(* XXX Potentially, there should be something like Size2 for smaller
sizes; just use ref cells rather than array
*)
(* ceil_power2 n is the least such m that m is an exact power of 2 and
n <= m
*)
let ceil_power2 : int -> int = fun n ->
let rec loop m = if m >= n then m else loop (2*m) in
loop 1
(* The general case of sliding window *)
module Sliding(S: sig type e val etyp: e tbase
val size: int val slide: int end) = struct
type e = S.e
let etyp = S.etyp
let size = S.size
let hist_size = size - 1
(* buff is the ring buffer containing the elements prior to curr: the history
idx is the index of the slot to write the current element afterwards
(that slot may contain the earliers element to evict,
if hist_size = buffer_len)
The buffer is filled *backwards*
We make the buffer a bit longer to make sure the best case of reduction
(with one loop) occurs more often.
*)
type t = {buffer: e arr; curr: e exp; idx: int exp}
let buffer_len = ceil_power2 (2*hist_size)
let buff_len1 = buffer_len - 1
let mask = C.int (buffer_len - 1)
let make_stream : ?padding:bool -> e exp stream -> t stream =
fun ?(padding=false) st ->
let- len = Raw.initializing_ref
(C.int @@ if padding then hist_size else 0) in
let- idx = Raw.initializing_ref
(C.int @@
if padding then (buff_len1-hist_size) land buff_len1
else buff_len1)
in
let- buffer = Raw.initializing_arr etyp
(Array.make buffer_len C.(tbase_zero etyp)) in
st |>
Raw.map_raw ~linear:false C.(fun e k ->
if_ (dref len < int hist_size) (
array_set buffer (dref idx) e @.
incr len @.
(idx := logand ((dref idx) - int 1) mask)
) (
k {buffer; idx = dref idx; curr=e} @.
array_set buffer (dref idx) e @.
(idx := logand ((dref idx) - int 1) mask) @.
if Stdlib.(S.slide > 1) then
len := dref len + int Stdlib.(1 - S.slide)
else
unit
)
)
let reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm =
fun op mapper {buffer;curr;idx} k ->
let open C in
let- sum = newref (mapper (int 0) curr) in
if_ (idx <= int Stdlib.(buffer_len - 1 - hist_size)) (
(* this is a static loop and unrollable/vectorizable *)
for_ (int 1) ~upe:(int Stdlib.(hist_size+1)) (fun i ->
sum := op (dref sum)
(mapper i (array_get' buffer (idx + i)))
)
) (
for_ (idx+int 1) ~upe:(int buffer_len) (fun i ->
sum := op (dref sum)
(mapper (i - idx) (array_get' buffer i))
) @.
for_ (int 0) ~upe:(idx + int Stdlib.(hist_size-buffer_len +1)) (fun i ->
sum := op (dref sum)
(mapper (i - idx + int buffer_len)
(array_get' buffer i))
)
) @.
k (dref sum)
end
(* Alternative general case of a sliding window
For the history size hist_size, the needed buffer space
is 2*hist_size -1. However, we allocate 2*hist_size (and waste
one memory cell and occasional assignment to it) to avoid
branching.
The general idea is that writing into a buffer writes into two places
at once, separated by hist_size. That avoids the need to copy or rotate
the buffer. The usable window is always the sequence of consecutive
slots.
*)
module SlidingAlt(S: sig type e val etyp: e tbase
val size: int val slide: int end) = struct
type e = S.e
let etyp = S.etyp
let size = S.size
let hist_size = size - 1
(* buff is the buffer containing the elements prior to curr: the history.
The buffer is filled *backwards*.
idx is the index of the most recently written element, and ranges
0 through hist_size-1.
*)
type t = {buffer: e arr; curr: e exp; idx: int exp}
let buffer_len = 2*hist_size
let hs1 = hist_size - 1
let make_stream : ?padding:bool -> e exp stream -> t stream =
fun ?(padding=false) st ->
let- idx = Raw.initializing_ref (C.int hs1) in
let- buffer = Raw.initializing_arr etyp
(Array.make buffer_len C.(tbase_zero etyp)) in
if padding && S.slide = 1 then (* window is always filled *)
st |>
Raw.map_raw C.(fun e k ->
k {buffer; idx = dref idx; curr=e} @.
decr idx @.
if1 (dref idx < int 0) (idx := int hs1) @.
array_set buffer (dref idx) e @.
array_set buffer (dref idx + int hist_size) e
)
else
let- len = Raw.initializing_ref
(C.int @@ if padding then hist_size else 0) in
st |>
Raw.map_raw ~linear:false C.(fun e k ->
if_ (dref len < int hist_size) (
decr idx @.
if1 (dref idx < int 0) (idx := int hs1) @.
array_set buffer (dref idx) e @.
array_set buffer (dref idx + int hist_size) e @.
incr len
) (
k {buffer; idx = dref idx; curr=e} @.
decr idx @.
if1 (dref idx < int 0) (idx := int hs1) @.
array_set buffer (dref idx) e @.
array_set buffer (dref idx + int hist_size) e @.
if Stdlib.(S.slide > 1) then
len := dref len + int Stdlib.(1 - S.slide)
else
unit
)
)
let reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm =
fun op mapper {buffer;curr;idx} k ->
let open C in
let- sum = newref (mapper (int 0) curr) in
for_ (int 0) ~upe:(int hist_size) (fun i ->
sum := op (dref sum)
(mapper (i+int 1) (array_get' buffer (idx + i)))
) @.
k (dref sum)
end
(* Really long and sliding-by-1 window.
The buffer is copied from-time-to-time, so the window
should be long enough to make it worthwhile
(copying occurs once in window-size steps).
The strong point: all loops have static bounds and are vectorizable.
*)
module SlidingMove(S: sig type e val etyp: e tbase
val size : int end) = struct
type e = S.e
let etyp = S.etyp
let size = S.size
(* buff is has twice the size.
idx is the index of the slot where curr (it to be) written.
The buffer is filled backwards! So,
idx+1 is the previous element, and idx+size-1 is the earliest
element in the window.
*)
type t = {buffer: e arr; idx: int exp; curr: e exp;}
let buffer_len = 2 * size
let make_stream : ?padding:bool -> e exp stream -> t stream =
fun ?(padding=false) st ->
let- idx = Raw.initializing_ref
(C.int @@ if padding then buffer_len - size
else buffer_len-1) in
let- buffer = Raw.initializing_arr etyp
(Array.make buffer_len C.(tbase_zero etyp))
in
if padding then (* The window is filled and stays full: len = size-1 *)
st |>
Raw.map_raw C.(fun e k ->
if1 (dref idx < int 0) (
(* shift elements 0 .. size-2 to bufflen+1-size .. bufflen-1 *)
for_ (int 0) ~upe:(int Stdlib.(size-1)) (fun i ->
array_set buffer (i + int Stdlib.(buffer_len + 1 - size))
(array_get' buffer i)
)
@.
(idx := int Stdlib.(buffer_len - size))
) @.
k {buffer; idx = dref idx; curr=e} @.
array_set buffer (dref idx) e @.
decr idx
)
else
let- len = Raw.initializing_ref
(C.int @@ if padding then size-1 else 0) in
st |>
Raw.map_raw ~linear:false C.(fun e k ->
if_ (dref len < int Stdlib.(size-1)) (
array_set buffer (dref idx) e @.
incr len @.
decr idx
) ( (* len is size-1: that many useful elements in the buffer *)
if1 (dref idx < int 0) (
(* shift elements 0 .. size-2 to bufflen+1-size .. bufflen-1 *)
for_ (int 0) ~upe:(int Stdlib.(size-1)) (fun i ->
array_set buffer (i + int Stdlib.(buffer_len + 1 - size))
(array_get' buffer i)
)
@.
(idx := int Stdlib.(buffer_len - size))
) @.
k {buffer; idx = dref idx; curr=e} @.
array_set buffer (dref idx) e @.
decr idx
)
)
let reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm =
fun op mapper {buffer;idx;curr} k ->
let open C in
let- sum = newref (mapper (int 0) curr) in
for_ (int 1) ~upe:(int size) (fun i ->
sum := op (dref sum)
(mapper i (array_get' buffer (idx + i)))
) @.
k (dref sum)
end
(*
(* Really long and sliding-by-N window.
The buffer is copied from-time-to-time, so the window
should be long enough to make it worthwhile
(copying occurs once in window-size steps).
The strong point: all loops have static bounds and are vectorizable.
*)
module SlidingNMove(S: sig type e val etyp: e tbase
val size : int
val slide : int (* >= 1 *)
end) = struct
type e = S.e
let etyp = S.etyp
let size = S.size
let _ = assert (S.slide >= 1)
(* buff is has (slide+1) x size.
idx is the index of the first free slot in the buffer
So, the useful elements are
idx-size .. idx-1
*)
type t = {buffer: e arr; idx: int exp}
let buffer_len = (S.slide + 1) * size
let make_stream : ?padding:bool -> e exp stream -> t stream =
fun ?(padding=false) st ->
let- len = Raw.initializing_ref (C.int @@ if padding then size-1 else 0) in
let- idx = Raw.initializing_ref (C.int @@ if padding then size-1 else 0) in
let- buffer = if padding then Raw.initializing_arr etyp (Array.make buffer_len C.(tbase_zero etyp))
else Raw.initializing_uarr etyp buffer_len in
st |>
Raw.map_raw ~linear:false C.(fun e k ->
if_ (dref len < int Stdlib.(size-1)) (
array_set buffer (dref idx) e @.
incr len @.
incr idx
) ( (* len is size-1: how many useful elements in the buffer *)
if1 (dref idx >= int Stdlib.(buffer_len-S.slide+1)) (
begin
if Stdlib.(size-1 < memcpy_num) then
for_ (int 0) (int Stdlib.(size-2)) (fun i ->
array_set buffer i
(array_get' buffer (dref idx - int Stdlib.(size-1) + i))
)
else
blit buffer (dref idx - int Stdlib.(size-1)) buffer (int 0) (int Stdlib.(size-1))
end @.
(idx := int Stdlib.(size-1))
) @.
array_set buffer (dref idx) e @.
incr idx @.
(len := dref len - int Stdlib.(S.slide - 1)) @.
k {buffer; idx = dref idx}
)
)
let reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm =
fun op mapper {buffer;idx} k ->
let open C in
let- sum = newref (tbase_zero etyp) in
let- ibeg = letl (idx - int size) in
for_ (int 0) (int Stdlib.(size-1)) (fun i ->
sum := op (dref sum)
(mapper i (array_get' buffer (ibeg + i)))
) @.
dref sum
end
(* Long, tumbling window
*)
module BigTumbling(S: sig type e val etyp: e tbase
val size: int end) = struct
type e = S.e
let etyp = S.etyp
let size = S.size
let hist_size = size - 1
(* buff contains the elements prior to curr: the history
*)
type t = {buffer: e arr; curr: e exp}
let buffer_len = hist_size
let make_stream : ?padding:bool -> e exp stream -> t stream =
fun ?(padding=false) st ->
let- idx = Raw.initializing_ref (C.int @@ if padding then hist_size else 0) in
let- buffer = if padding then Raw.initializing_arr etyp (Array.make buffer_len C.(tbase_zero etyp))
else Raw.initializing_uarr etyp buffer_len in
st |>
Raw.map_raw ~linear:false C.(fun e k ->
if_ (dref idx < int hist_size) (
array_set buffer (dref idx) e @.
incr idx
) (
k {buffer; curr=e} @.
(idx := int 0)
)
)
let reduce_gen : ('a exp -> 'a exp -> 'a exp) ->
(int exp -> e exp -> 'a exp) -> t -> ('a exp -> 'w stm) -> 'w stm =
fun op mapper {buffer;curr} ->
failwith "mada"
end
*)
(* size = taps, slide = 1+decimation (Only StreamIt) . Size should be at least 2 *)
(* the dispatcher *)
let make_window : type a. a tbase -> int -> int -> a window =
fun typ size slide ->
assert (size>1);
assert (slide>=1 && slide <= size);
let module Parm = struct type e = a let etyp = typ let size = size end in
if size = 2 then
(module Size2(struct include Parm let slide = slide=1 end))
(*
else if slide = size then
(module WinEx(BigTumbling(Parm)))
*)
else if slide = 1 && size > 20 then
(module WinEx(SlidingMove(Parm)))
(*
else if slide > 1 && size > 10 then
(module WinEx(SlidingNMove(struct
include Parm let slide=slide end)))
*)
else
(module WinEx(SlidingAlt(struct include Parm let slide=slide end)))
end