Cache-Simulator/src/Simulation/Cache.h at master · dominikw1/Cache-Simulator · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
#pragma once

#include "../Request.h"
#include "Cacheline.h"
#include "DecomposedAddress.h"
#include "Policy/ReplacementPolicy.h"
#include "SubRequest.h"
#include "WriteBuffer.h"

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <memory>
#include <type_traits>
#include <typeinfo>
#include <unordered_map>

#include <systemc>

/**
 * The mapping strategies a Cache can be instantiated with (it is the template parameter of the Cache module).
 */
enum class MappingType {
    Direct,
    Fully_Associative,
};

// Width of the bus used for reads from the RAM, in bytes. This is NOT a config value, just a named constant that
// makes the bus width transparently accessible wherever it is needed.
constexpr std::uint16_t RAM_READ_BUS_SIZE_IN_BYTE = 16;
// Number of bits in a byte. The systemc BITS_PER_BYTE could be used instead, but this gives more transparency.
constexpr std::uint16_t BITS_IN_BYTE = 8;
// Number of writes the write buffer can hold at once. The value was picked arbitrarily.
constexpr std::uint16_t WRITE_BUFFER_SIZE = 4;

/**
 * This module represents a Cache of a certain mapping type (Direct / Fully associative). It is meant to be connected
 * to a CPU and a RAM module.
 *
 * The data bus size requirements are: 32 bit buses for data in/out to and from the CPU, a 128 bit bus for reads
 * from the RAM and a 32 bit bus for writes to the RAM.
 *
 * Communication happens through a variation of the Ready/Valid Protocol. In our implementation, the requesting party
 * sets the corresponding valid-request signal to true. The responding party then performs the operation and, once it
 * is ready to transmit, sets its ready signal to true. There is no separate "ready to receive" from the receiving
 * party, as in our simplified simulation we are always waiting just for the ready-to-send signal.
 *
 * All operations happen on the rising clock edge.
 *
 * This cache uses a write buffer able to buffer WRITE_BUFFER_SIZE writes at once. See its documentation for more
 * detail. Its optimisation can be turned off by compiling with the definition STRICT_INSTRUCTION_ORDER.
 *
 */
template <MappingType mappingType> SC_MODULE(Cache) {
  public:
    // ====================================== External Ports  ======================================
    // Global Clock
    sc_core::sc_in<bool> SC_NAMED(clock);

    // CPU -> Cache
    sc_core::sc_in<std::uint32_t> SC_NAMED(cpuAddrBus);
    sc_core::sc_in<std::uint32_t> SC_NAMED(cpuDataInBus);
    sc_core::sc_in<bool> SC_NAMED(cpuWeBus); // presumably "write enable" (true = write request) - confirm with CPU
    sc_core::sc_in<bool> SC_NAMED(cpuValidRequest);

    // Cache -> RAM
    sc_core::sc_out<std::uint32_t> SC_NAMED(memoryAddrBus);
    sc_core::sc_out<std::uint32_t> SC_NAMED(memoryDataOutBus);
    sc_core::sc_out<bool> SC_NAMED(memoryWeBus);
    sc_core::sc_out<bool> SC_NAMED(memoryValidRequestBus);

    // RAM -> Cache
    // The read bus is RAM_READ_BUS_SIZE_IN_BYTE * BITS_IN_BYTE (= 128) bits wide.
    sc_core::sc_in<sc_dt::sc_bv<RAM_READ_BUS_SIZE_IN_BYTE * BITS_IN_BYTE>> SC_NAMED(memoryDataInBus);
    sc_core::sc_in<bool> SC_NAMED(memoryReadyBus);

    // Cache -> CPU
    sc_core::sc_out<bool> SC_NAMED(ready);
    sc_core::sc_out<std::uint32_t> SC_NAMED(cpuDataOutBus);

  private:
    // ====================================== Internal Signals  ======================================
    // Buffer -> Cache
    // Declared with the SC_MANY_WRITERS policy, i.e. more than one process is allowed to write this signal.
    sc_core::sc_signal<bool, sc_core::SC_MANY_WRITERS> SC_NAMED(writeBufferReady);
    // Same width as the RAM read bus.
    sc_core::sc_signal<sc_dt::sc_bv<RAM_READ_BUS_SIZE_IN_BYTE * BITS_IN_BYTE>> SC_NAMED(writeBufferDataOut);

    // Cache -> Buffer
    sc_core::sc_signal<std::uint32_t> SC_NAMED(writeBufferAddr);
    sc_core::sc_signal<std::uint32_t> SC_NAMED(writeBufferDataIn);
    sc_core::sc_signal<bool> SC_NAMED(writeBufferWE);
    sc_core::sc_signal<bool> SC_NAMED(writeBufferValidRequest);

  public:
    // ====================================== Hit/Miss Bookkeeping  ======================================
    // Public counters, both starting at 0, so that they can be read from outside the module.
    std::uint64_t hitCount{0};
    std::uint64_t missCount{0};

  private:
    // ====================================== Config  ======================================
    std::uint32_t numCacheLines{0};
    std::uint32_t cacheLineSize{0}; // in Byte
    std::uint32_t cacheLatency{0};  // in Cycles
    std::unique_ptr<ReplacementPolicy<std::uint32_t>> replacementPolicy{nullptr};
#ifdef STRICT_INSTRUCTION_ORDER
    // Only exists in STRICT_INSTRUCTION_ORDER builds; presumably assigned through setMemoryLatency() - confirm.
    std::uint32_t memoryLatency{0};
#endif

    // ====================================== Internals ======================================
    std::vector<Cacheline> cacheInternal;
    WriteBuffer<WRITE_BUFFER_SIZE> writeBuffer;

    struct Empty {}; // we only want to pay the price for having a hash-table if we need it
    // Derives from a hash map only for fully associative caches and from the empty struct otherwise. On top of that
    // it always carries a counter of used cachelines.
    // NOTE(review): the meaning of the map's key and value (presumably tag -> cacheline index) is not visible here.
    struct CachelineLookupTableType : std::conditional<mappingType == MappingType::Fully_Associative,
                                                       std::unordered_map<std::uint32_t, std::uint32_t>, Empty>::type {
        std::uint32_t numCacheLinesUsed{0};
    } cachelineLookupTable;

    // ====================================== Precomputation ======================================
    // Bit counts and bit masks for the tag / index / offset parts of an address. All start at 0; see
    // precomputeAddressDecompositionBits().
    std::uint32_t addressOffsetBits{0};
    std::uint32_t addressIndexBits{0};
    std::uint32_t addressTagBits{0};
    std::uint32_t addressOffsetBitMask{0};
    std::uint32_t addressIndexBitMask{0};
    std::uint32_t addressTagBitMask{0};

  public:
    /**
     * Constructs a write-buffering cache.
     * @param[in] name  The name systemc assigns to this module.
     * @param[in] numCacheLines The number of cache lines the cache will have. Has to be > 0.
     * @param[in] cacheLineSize The number of bytes a cacheline holds. Has to be a multiple of the memory bus size 16B
     * and > 0.
     * @param[in] cacheLatency The number of cycles the cache takes to find out whether an access results in a hit or a
     * miss. Mind that the total number of cycles until data has been fully transferred is strictly larger than this
     * value, as the transferring takes some cycles itself.
     * @param[in] policy Optional parameter - the replacement policy taking effect when the cache is full and a new
     * entry shall be stored. Only relevant if MappingType is Fully_Associative, results in a warning on Direct if not
     * nullptr. Takes ownership of the policy. Default value is nullptr.
     */
    Cache(sc_core::sc_module_name name, std::uint32_t numCacheLines, std::uint32_t cacheLineSize,
          std::uint32_t cacheLatency, std::unique_ptr<ReplacementPolicy<std::uint32_t>> policy = nullptr);
    /**
     * Approximates the primitive gate count used to construct this cache.
     * @returns An approximation of the number of primitive gates within this cache
     */
    std::size_t calculateGateCount() const noexcept;

    /**
     * Adds the internal signals to and from the write buffer to the trace file.
     * @param[in] traceFile The trace file the signals shall be added to
     */
    void traceInternalSignals(sc_core::sc_trace_file* const traceFile) const;

#ifdef STRICT_INSTRUCTION_ORDER
    /**
     * Sets the correct memory latency this system experiences. This is only to be used if one insists on bypassing
     * the speedup of the write buffer.
     * @param[in] memoryLatency The latency of the RAM
     */
    void setMemoryLatency(std::uint32_t memoryLatency);
#endif

  private:
    // ====================================== Set-Up ======================================
    SC_CTOR(Cache); // private since this is never to be called, just to get systemc typedef

    /**
     * Connect the internal write buffer to the external ports of the cache and internal signals used to communicate
     * with it.
     */
    void setUpWriteBufferConnects() noexcept;
    /**
     * Initialise all cachelines with 0 bytes.
     */
    void zeroInitialiseCachelines() noexcept;
    /**
     * Precomputes what (and how many) bits of an address correspond to tag, index and offset in our cache. Furthermore
     * preconstructs bit masks to extract those values.
     */
    void precomputeAddressDecompositionBits() noexcept;

    // ====================================== Main Request Handling ======================================
    /**
     * The main point of entry for each request. The request gets split up into non-cacheline-crossing subrequests and
     * the appropriate signals to the outside world are set.
     */
    void handleRequest() noexcept;
    /**
     * Handles the cacheline-internal requests and controls the actual performance like loading the cacheline from RAM
     * if not present, controlling the writes and reads, etc.
     * @param[in] subRequest  The subrequest to be executed
     * @param[out] readData The data just read in this subrequest. Undefined in write requests.
     */
    void handleSubRequest(SubRequest subRequest, std::uint32_t & readData) noexcept;
    /**
     * Constructs a request object by reading the busses written to by the CPU for easier internal handling.
     * @returns the constructed request
     */
    Request constructRequestFromBusses() const noexcept;

    // ============ Helpers to determine which cache line to read from / write to ============
    /**
     * Use precomputed masks to decompose address into tag, index and offset
     * @param[in] address The address to be decomposed
     * @returns The address decomposed into tag, index and offset
     */
    DecomposedAddress decomposeAddress(std::uint32_t address) noexcept;
    /**
     * Use address decomposed into tag, index and offset to find a cacheline in the cache that is already "owned" by
     * this address, meaning the tag matches and it is a valid cacheline. How this cacheline is found is determined by
     * the Mapping Type
     * @param[in] decomposedAddr  The address decomposed into tag, index, offset
     * @returns an iterator to the cacheline we own. Returns end() iterator if none found
     */
    std::vector<Cacheline>::iterator getCachelineOwnedByAddr(const DecomposedAddress& decomposedAddr) noexcept;
    /**
     * Determine which cacheline the read from RAM shall be read into. How this is chosen depends on the MappingType.
     * Note that, unlike its sibling helpers, this function is not declared noexcept.
     * @param[in] decomposedAddr  The address decomposed into tag, index, offset
     * @returns an iterator to the cacheline to be read into. Always a valid iterator
     */
    std::vector<Cacheline>::iterator chooseWhichCachelineToFillFromRAM(const DecomposedAddress& decomposedAddr);
    /**
     * If this is a fully associative cache with a stateful policy (e.g. LRU), this updates the aforementioned state. If
     * direct mapped, this is a NOP
     * @param[in] cacheline The cacheline an operation was performed on
     */
    void registerUsage(std::vector<Cacheline>::iterator cacheline) noexcept;

    // ====================================== Reading from Cache ======================================
    /**
     * Determines whether we have a cache hit or not and fetches the cacheline from RAM if it's a miss
     * @param[in] addr  The address we want to do an operation on
     * @param[in] decomposedAddr  The address pre-decomposed into tag, index and offset
     * @returns an iterator to the cacheline (now) populated with the correct data corresponding to the address
     */
    std::vector<Cacheline>::iterator fetchIfNotPresent(std::uint32_t addr,
                                                       const DecomposedAddress& decomposedAddr) noexcept;
    /**
     * Sends request to RAM through Write Buffer to read in cacheline.
     * @param[in] addr  Cacheline-size aligned addr for the first byte to be read
     */
    void startReadFromRAM(std::uint32_t addr) noexcept;
    /**
     * Reads a data segment from cacheline
     * @param[in] decomposedAddr  The address decomposed into tag, index and offset
     * @param[in] cacheline The cacheline to be read from
     * @param[in] numBytes  The amount of bytes we want to read
     * @returns the data segment just read in the lowest numBytes bytes of the uint32_t
     */
    std::uint32_t doRead(const DecomposedAddress& decomposedAddr, Cacheline& cacheline,
                         std::uint32_t numBytes) noexcept;
    /**
     * Reads data from bus written to by RAM and copies it into the corresponding cacheline
     * @param[in] decomposedAddr  The address decomposed into tag, index and offset
     * @returns an iterator into the cacheline storage - presumably the cacheline that was just filled (confirm
     * against the definition)
     */
    std::vector<Cacheline>::iterator writeRAMReadIntoCacheline(const DecomposedAddress& decomposedAddr) noexcept;

    // ====================================== Writing to Cache ======================================
    /**
     * Writes the data into the cacheline
     * @param[in] cacheline The cacheline to be written into
     * @param[in] decomposedAddr The address decomposed into tag, index and offset
     * @param[in] data The data to be written
     * @param[in] numBytes The number of bytes of data to be written
     */
    void doWrite(Cacheline & cacheline, const DecomposedAddress& decomposedAddr, std::uint32_t data,
                 std::uint32_t numBytes) noexcept;
    /**
     * Passes the write request to the RAM through the write buffer
     * @param[in] cacheline The cacheline the data is taken from
     * @param[in] decomposedAddr The address decomposed into tag, index and offset
     * @param[in] addr The actual address in RAM we want to write to
     */
    void passWriteOnToRAM(Cacheline & cacheline, const DecomposedAddress& decomposedAddr, std::uint32_t addr) noexcept;

    // ====================================== Waiting Helpers ======================================
    /**
     * Sleeps for cacheLatency cycles
     */
    void waitOutCacheLatency() noexcept;
    /**
     * Sleeps until we get a ready signal from RAM (through write buffer)
     */
    void waitForRAM() noexcept;
};