Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gateware.iostream.IOStreamer: let o_stream transfer if i_stream not rdy #675

Draft
wants to merge 14 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 24 additions & 3 deletions software/glasgow/applet/interface/qspi_controller/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,20 @@ class _QSPICommand(enum.Enum, shape=4):


class QSPIControllerSubtarget(Elaboratable):
def __init__(self, *, ports, out_fifo, in_fifo, divisor, us_cycles):
def __init__(self, *, ports, out_fifo, in_fifo, divisor, us_cycles, sample_delay_half_clocks=0):
self._ports = ports
self._out_fifo = out_fifo
self._in_fifo = in_fifo

self._divisor = divisor
self._us_cycles = us_cycles
self._sample_delay_half_clocks = sample_delay_half_clocks

def elaborate(self, platform):
m = Module()

m.submodules.qspi = qspi = QSPIController(self._ports, use_ddr_buffers=True)
m.submodules.qspi = qspi = QSPIController(self._ports, use_ddr_buffers=True,
sample_delay_half_clocks = self._sample_delay_half_clocks)
m.d.comb += qspi.divisor.eq(self._divisor)

o_fifo = self._out_fifo.stream
Expand Down Expand Up @@ -248,8 +250,26 @@ def add_build_arguments(cls, parser, access, *, include_pins=True):
"-f", "--frequency", metavar="FREQ", type=int, default=1000,
help="set SCK frequency to FREQ kHz (default: %(default)s)")

parser.add_argument(
"-d", "--sample-delay", metavar="SAMPLE_DELAY", type=int, required=False,
help="Specify sample delay in units of half clock-cycles. (Default: frequency-dependent)")

def build(self, target, args):
self.mux_interface = iface = target.multiplexer.claim_interface(self, args)
divisor=int(target.sys_clk_freq // (args.frequency * 2000))
if divisor != 0:
actual_frequency = target.sys_clk_freq / divisor / 2
else:
actual_frequency = target.sys_clk_freq
if args.sample_delay is None:
if actual_frequency <= 24_000_000.1:
sample_delay = 0
elif actual_frequency <= 60_000_000.1:
sample_delay = 1
else:
sample_delay = 2
else:
sample_delay = args.sample_delay
return iface.add_subtarget(QSPIControllerSubtarget(
ports=iface.get_port_group(
sck=args.pin_sck,
Expand All @@ -258,8 +278,9 @@ def build(self, target, args):
),
out_fifo=iface.get_out_fifo(),
in_fifo=iface.get_in_fifo(auto_flush=False),
divisor=int(target.sys_clk_freq // (args.frequency * 2000)),
divisor=divisor,
us_cycles=int(target.sys_clk_freq // 1_000_000),
sample_delay_half_clocks = sample_delay,
))

async def run(self, device, args):
Expand Down
69 changes: 44 additions & 25 deletions software/glasgow/gateware/iostream.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def _filter_ioshape(direction, ioshape):
def _iter_ioshape(direction, ioshape, *args): # actually filter+iter
for name, item in ioshape.items():
if _filter_ioshape(direction, ioshape[name]):
yield tuple(arg[name] for arg in args)
yield (name, *(arg[name] for arg in args))


def _map_ioshape(direction, ioshape, fn): # actually filter+map
Expand Down Expand Up @@ -106,14 +106,17 @@ def i_stream_signature(ioshape, /, *, ratio=1, meta_layout=0):
"meta": meta_layout,
}))

def __init__(self, ioshape, ports, /, *, ratio=1, init=None, meta_layout=0):
def __init__(self, ioshape, ports, /, *, ratio=1, init=None, meta_layout=0, sample_delay_half_clocks=0):
if ratio == 1:
assert (sample_delay_half_clocks % 2) == 0
assert isinstance(ioshape, (int, dict))
assert ratio in (1, 2)

self._ioshape = ioshape
self._ports = ports
self._ratio = ratio
self._init = init
self._sample_delay_half_clocks = sample_delay_half_clocks

super().__init__({
"o_stream": In(self.o_stream_signature(ioshape, ratio=ratio, meta_layout=meta_layout)),
Expand All @@ -124,10 +127,9 @@ def elaborate(self, platform):
m = Module()

if self._ratio == 1:
buffer_cls, latency = io.FFBuffer, 1
buffer_cls, latency = io.FFBuffer, 1 + self._sample_delay_half_clocks // 2
if self._ratio == 2:
# FIXME: Should this be 2 or 3? The latency differs between i[0] and i[1].
buffer_cls, latency = SimulatableDDRBuffer, 3
buffer_cls, latency = SimulatableDDRBuffer, 2 + (self._sample_delay_half_clocks // 2) + (self._sample_delay_half_clocks % 2)

if isinstance(self._ports, io.PortLike):
m.submodules.buffer = buffer = buffer_cls("io", self._ports)
Expand All @@ -142,19 +144,19 @@ def elaborate(self, platform):
"oe": 1,
})), init=self._init)
with m.If(self.o_stream.valid & self.o_stream.ready):
for buffer_parts, stream_parts in _iter_ioshape("o", self._ioshape,
for _, buffer_parts, stream_parts in _iter_ioshape("o", self._ioshape,
buffer, self.o_stream.p.port):
m.d.comb += buffer_parts.o.eq(stream_parts.o)
m.d.comb += buffer_parts.oe.eq(stream_parts.oe)
for latch_parts, stream_parts in _iter_ioshape("o", self._ioshape,
for _, latch_parts, stream_parts in _iter_ioshape("o", self._ioshape,
o_latch, self.o_stream.p.port):
if self._ratio == 1:
m.d.sync += latch_parts.o.eq(stream_parts.o)
else:
m.d.sync += latch_parts.o.eq(stream_parts.o[-1])
m.d.sync += latch_parts.oe.eq(stream_parts.oe)
with m.Else():
for buffer_parts, latch_parts in _iter_ioshape("o", self._ioshape,
for _, buffer_parts, latch_parts in _iter_ioshape("o", self._ioshape,
buffer, o_latch):
if self._ratio == 1:
m.d.comb += buffer_parts.o.eq(latch_parts.o)
Expand All @@ -163,37 +165,49 @@ def elaborate(self, platform):
m.d.comb += buffer_parts.oe.eq(latch_parts.oe)

def delay(value, name):
delayed_values = []
for stage in range(latency):
next_value = Signal.like(value, name=f"{name}_{stage}")
m.d.sync += next_value.eq(value)
value = next_value
return value
delayed_values.append(next_value)
return delayed_values

i_en = delay(self.o_stream.valid & self.o_stream.ready &
self.o_stream.p.i_en, name="i_en")
meta = delay(self.o_stream.p.meta, name="meta")
i_en_delays = delay(self.o_stream.valid & self.o_stream.ready &
self.o_stream.p.i_en, name="i_en")
i_en = i_en_delays[-1]
meta = delay(self.o_stream.p.meta, name="meta")[-1]

# This skid buffer is organized as a shift register to avoid any uncertainties associated
# with the use of an async read memory. On platforms that have LUTRAM, this implementation
# may be slightly worse than using LUTRAM, and may have to be revisited in the future.
skid = Array(Signal(self.i_stream.payload.shape(), name=f"skid_{stage}")
for stage in range(1 + latency))
for skid_parts, buffer_parts in _iter_ioshape("i", self._ioshape, skid[0].port, buffer):
m.d.comb += skid_parts.i.eq(buffer_parts.i)
for name, skid_parts, buffer_parts in _iter_ioshape("i", self._ioshape, skid[0].port, buffer):
if self._sample_delay_half_clocks % 2:
m.d.comb += skid_parts.i[1].eq(buffer_parts.i[0])
i1_delayed = Signal.like(buffer_parts.i[1], name=f"{name}_i1_delayed")
m.d.sync += i1_delayed.eq(buffer_parts.i[1])
m.d.comb += skid_parts.i[0].eq(i1_delayed)
else:
m.d.comb += skid_parts.i.eq(buffer_parts.i)
m.d.comb += skid[0].meta.eq(meta)

skid_at = Signal(range(1 + latency))

with m.If(i_en):
for n_shift in range(latency):
m.d.sync += skid[n_shift + 1].eq(skid[n_shift])

with m.If(i_en & ~self.i_stream.ready):
# m.d.sync += Assert(skid_at != latency)
m.d.sync += skid_at.eq(skid_at + 1)
for n_shift in range(latency):
m.d.sync += skid[n_shift + 1].eq(skid[n_shift])
with m.Elif((skid_at != 0) & self.i_stream.ready):
with m.Elif((skid_at != 0) & ~i_en & self.i_stream.ready):
m.d.sync += skid_at.eq(skid_at - 1)

m.d.comb += self.i_stream.payload.eq(skid[skid_at])
m.d.comb += self.i_stream.valid.eq(i_en | (skid_at != 0))
m.d.comb += self.o_stream.ready.eq(self.i_stream.ready & (skid_at == 0))
m.d.comb += self.o_stream.ready.eq(self.i_stream.ready | ~((skid_at!=0) | Cat(*i_en_delays).any()))

return m

Expand Down Expand Up @@ -242,7 +256,7 @@ def elaborate(self, platform):

# Forward the inputs to the outputs as-is. This includes the clock; it is overridden below
# if the clocker is used (not bypassed).
for i_parts, o_parts in _iter_ioshape("io", self._ioshape,
for _, i_parts, o_parts in _iter_ioshape("io", self._ioshape,
self.i_stream.p.port, self.o_stream.p.port):
m.d.comb += o_parts.o .eq(i_parts.o.replicate(self._o_ratio))
m.d.comb += o_parts.oe.eq(i_parts.oe)
Expand All @@ -256,7 +270,10 @@ def elaborate(self, platform):
if self._o_ratio == 1:
m.d.comb += self.o_stream.p.port[self._clock].o.eq(phase)
if self._o_ratio == 2:
m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(~phase, phase))
with m.If(self.divisor == 0):
m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(~phase, phase))
with m.Else():
m.d.comb += self.o_stream.p.port[self._clock].o.eq(Cat(phase, phase))
m.d.comb += self.o_stream.p.port[self._clock].oe.eq(1)
# ... while requesting input sampling only for the rising edge. (Interfaces triggering
# transfers on falling edge will be inverting the clock at the `IOPort` level.)
Expand All @@ -274,11 +291,13 @@ def elaborate(self, platform):
m.d.comb += self.i_stream.ready.eq(self.o_stream.ready)

with m.Else(): # Produce a falling edge at the output.
# Whenever DDR output is used, `phase == 1` outputs a low state first and
# a high state second. When `phase == 1` payloads are output back to back
# (in DDR mode only!) this generates a pulse train with data changes
# coinciding with the falling edges. Setting `divisor == 0` in this mode
# allows clocking the peripheral at the `sync` frequency.
# Whenever DDR output is used with `divisor == 0`, we output a low state
# on the first half of the clock cycle, and a high state on the second half.
# This mode allows clocking the peripheral at the `sync` frequency.
# In this case, the signal sampled at the rising edge will be output on i[1]
# (if sample_delay was set to zero).
# In all other cases, the signal sampled at the rising edge will be output on i[0]
# (if sample_delay was set to zero).
with m.If((self._o_ratio == 2) & (self.divisor == 0)):
m.d.comb += phase.eq(1)
with m.If(self.o_stream.ready):
Expand Down
15 changes: 13 additions & 2 deletions software/glasgow/gateware/qspi.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def elaborate(self, platform):


class QSPIController(wiring.Component):
def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False):
def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False, sample_delay_half_clocks=0):
assert len(ports.sck) == 1 and ports.sck.direction in (io.Direction.Output, io.Direction.Bidir)
assert len(ports.io) == 4 and ports.io.direction == io.Direction.Bidir
assert len(ports.cs) >= 1 and ports.cs.direction in (io.Direction.Output, io.Direction.Bidir)
Expand All @@ -162,6 +162,7 @@ def __init__(self, ports, *, chip_count=1, use_ddr_buffers=False):
)
self._ddr = use_ddr_buffers
self._chip_count = chip_count
self._sample_delay_half_clocks = sample_delay_half_clocks

super().__init__({
"o_octets": In(stream.Signature(data.StructLayout({
Expand Down Expand Up @@ -200,7 +201,8 @@ def elaborate(self, platform):
m.submodules.io_streamer = io_streamer = IOStreamer(ioshape, self._ports, init={
"sck": {"o": 1, "oe": 1}, # Motorola "Mode 3" with clock idling high
"cs": {"o": 0, "oe": 1}, # deselected
}, ratio=ratio, meta_layout=QSPIMode)
}, ratio=ratio, meta_layout=QSPIMode,
sample_delay_half_clocks=self._sample_delay_half_clocks)
connect(m, io_clocker=io_clocker.o_stream, io_streamer=io_streamer.o_stream)

m.submodules.deframer = deframer = QSPIDeframer()
Expand All @@ -214,6 +216,15 @@ def elaborate(self, platform):
io_streamer.i_stream.ready.eq(deframer.frames.ready),
]

if self._ddr:
with m.If(self.divisor == 0):
m.d.comb += [
deframer.frames.p.port.io0.i.eq(io_streamer.i_stream.p.port.io0.i[1]),
deframer.frames.p.port.io1.i.eq(io_streamer.i_stream.p.port.io1.i[1]),
deframer.frames.p.port.io2.i.eq(io_streamer.i_stream.p.port.io2.i[1]),
deframer.frames.p.port.io3.i.eq(io_streamer.i_stream.p.port.io3.i[1]),
]

connect(m, deframer=deframer.octets, controller=flipped(self.i_octets))

return m
Loading
Loading