/* Default the intrinsics API version to 1.3 when the including code has not
 * already defined DPA_INTRIN_VERSION_USED. */
24 #ifndef DPA_INTRIN_VERSION_USED
25 #define DPA_INTRIN_VERSION_USED (DPA_INTRIN_VERSION(1, 3))
/* The definitions that follow are selected when version 1.3 is in use. */
28 #if (DPA_INTRIN_VERSION_USED == (DPA_INTRIN_VERSION(1, 3)))
/* Memory-space selectors: __DPA_* spellings that alias the __MSPACE_* names. */
30 #define __DPA_HEAP __MSPACE_HEAP
32 #define __DPA_MEMORY __MSPACE_MEMORY
34 #define __DPA_MMIO __MSPACE_MMIO
36 #define __DPA_SYSTEM __MSPACE_SYSTEM
/* Memory-operation selectors: __DPA_* spellings that alias the __MOP_* names
 * (R, W and RW). */
39 #define __DPA_R __MOP_R
41 #define __DPA_W __MOP_W
43 #define __DPA_RW __MOP_RW
/*
 * __dpa_thread_fence(MEMORY_SPACE, PRED_OP, SUCC_OP)
 *
 * Forwards the memory-space selector and the predecessor/successor operation
 * selectors unchanged to __dpa_thread_fence_internal_1_3().
 *
 * The expansion deliberately carries no trailing semicolon: the caller writes
 * the terminating ';'. This keeps the macro usable as a single statement in
 * an unbraced if/else and inside other macros without producing a stray
 * empty statement.
 */
54 #define __dpa_thread_fence(MEMORY_SPACE, PRED_OP, SUCC_OP) \
55 __dpa_thread_fence_internal_1_3(MEMORY_SPACE, PRED_OP, SUCC_OP)
/* Convenience wrappers around __dpa_thread_fence() with the memory space
 * (and, for the argument-less forms, the operation selectors) fixed. */
/* Fence on the __DPA_MEMORY space; OP1/OP2 are passed through as the
 * predecessor and successor operation selectors. */
58 #define __dpa_thread_memory_fence(OP1, OP2) \
59 __dpa_thread_fence(__DPA_MEMORY, OP1, OP2)
/* Fence on the __DPA_MMIO space with caller-supplied operation selectors. */
62 #define __dpa_thread_outbox_fence(OP1, OP2) \
63 __dpa_thread_fence(__DPA_MMIO, OP1, OP2)
/* Expands identically to __dpa_thread_outbox_fence: also a __DPA_MMIO fence. */
66 #define __dpa_thread_window_fence(OP1, OP2) \
67 __dpa_thread_fence(__DPA_MMIO, OP1, OP2)
/* Fence on the __DPA_SYSTEM space with __DPA_RW for both selectors. */
70 #define __dpa_thread_system_fence() \
71 __dpa_thread_fence(__DPA_SYSTEM, __DPA_RW, __DPA_RW)
/* __DPA_MMIO fence with __DPA_R for both selectors. */
77 #define __dpa_thread_window_read_inv() \
78 __dpa_thread_fence(__DPA_MMIO, __DPA_R, __DPA_R)
/* __DPA_MMIO fence with __DPA_W for both selectors. */
85 #define __dpa_thread_window_writeback() \
86 __dpa_thread_fence(__DPA_MMIO, __DPA_W, __DPA_W)
/* NOTE(review): this condition tests NV_DPA_CX8, while the check below uses
 * __NV_DPA_BF3 (double-underscore prefix). An identifier that is not defined
 * evaluates to 0 inside #if, which would make this condition always true --
 * confirm that NV_DPA_CX8 is really defined for this toolchain. */
88 #if __NV_DPA >= NV_DPA_CX8
/* Forwards to __dpa_thread_l1_flush_internal_1_3(). */
90 #define __dpa_thread_l1_flush() \
91 __dpa_thread_l1_flush_internal_1_3()
/* __dpa_thread_memory_writeback() has two definitions, selected by target. */
94 #if __NV_DPA == __NV_DPA_BF3
/* When __NV_DPA equals __NV_DPA_BF3: a __DPA_MEMORY fence with __DPA_W for
 * both selectors. */
100 #define __dpa_thread_memory_writeback() \
101 __dpa_thread_fence(__DPA_MEMORY, __DPA_W, __DPA_W)
/* Alternative definition: only __dpa_compiler_barrier(). Presumably the #else
 * arm of the check above -- the directive is not visible here; TODO confirm. */
103 #define __dpa_thread_memory_writeback() __dpa_compiler_barrier()
/* Fixed-point helpers, available only when __riscv_xrpfxp is defined. Each
 * macro forwards its argument to the matching *_internal_1_3 routine. */
106 #if defined(__riscv_xrpfxp)
109 #define __dpa_fxp_rcp(N) __dpa_fxp_rcp_internal_1_3(N)
112 #define __dpa_fxp_pow2(N) __dpa_fxp_pow2_internal_1_3(N)
115 #define __dpa_fxp_log2(N) __dpa_fxp_log2_internal_1_3(N)
/* __dpa_data_ignore is provided when __riscv_xnvcc is defined; it forwards
 * ADDR to __dpa_data_ignore_internal_1_3(). */
118 #if defined(__riscv_xnvcc)
119 #define __dpa_data_ignore(ADDR) __dpa_data_ignore_internal_1_3(ADDR)
/* Argument-less forwarders to the corresponding *_internal_1_3 routines. */
127 #define __dpa_thread_cycles() __dpa_thread_cycles_internal_1_3()
133 #define __dpa_thread_inst_ret() __dpa_thread_inst_ret_internal_1_3()
138 #define __dpa_thread_time() __dpa_thread_time_internal_1_3()
/* Remote atomic operations. Every macro forwards its arguments, in the same
 * order, to the *_internal_1_3 routine of the same name.
 * NOTE(review): the guard tests NV_DPA_CX8 without the double-underscore
 * prefix used by __NV_DPA_BF3 elsewhere in this file; an undefined identifier
 * evaluates to 0 in #if -- confirm the macro name. */
140 #if __NV_DPA >= NV_DPA_CX8
/* Load through PTR with the given memory order. */
145 #define __dpa_remote_atomic_load(PTR, MEMORDER) \
146 __dpa_remote_atomic_load_internal_1_3(PTR, MEMORDER)
/* Exchange with VAL. */
154 #define __dpa_remote_atomic_exchange(PTR, VAL, MEMORDER) \
155 __dpa_remote_atomic_exchange_internal_1_3(PTR, VAL, MEMORDER)
/* add_fetch / sub_fetch variants. */
164 #define __dpa_remote_atomic_add_fetch(PTR, VAL, MEMORDER) \
165 __dpa_remote_atomic_add_fetch_internal_1_3(PTR, VAL, MEMORDER)
166 #define __dpa_remote_atomic_sub_fetch(PTR, VAL, MEMORDER) \
167 __dpa_remote_atomic_sub_fetch_internal_1_3(PTR, VAL, MEMORDER)
/* fetch_add / fetch_sub variants. */
176 #define __dpa_remote_atomic_fetch_add(PTR, VAL, MEMORDER) \
177 __dpa_remote_atomic_fetch_add_internal_1_3(PTR, VAL, MEMORDER)
178 #define __dpa_remote_atomic_fetch_sub(PTR, VAL, MEMORDER) \
179 __dpa_remote_atomic_fetch_sub_internal_1_3(PTR, VAL, MEMORDER)
/* Compare-exchange is provided only when __riscv_xcas is defined; it takes
 * separate memory orders for the success and failure cases. */
181 #if defined(__riscv_xcas)
191 #define __dpa_remote_atomic_compare_exchange(PTR, EXPECTED, DESIRED, \
192 SUCC_MEMORDER, FAIL_MEMORDER) \
193 __dpa_remote_atomic_compare_exchange_internal_1_3(PTR, EXPECTED, DESIRED, \
194 SUCC_MEMORDER, FAIL_MEMORDER)
/*
 * __extract_fields(dst, src, ...)
 *
 * Packs the variadic arguments into an int[] compound literal and calls
 * __extract_fields_internal(dst, src, count, array). The count is the number
 * of ints supplied divided by 3, i.e. the arguments are consumed as triplets;
 * any arguments beyond a whole multiple of three are not counted.
 * __VA_ARGS__ is expanded twice (once inside sizeof, which does not evaluate
 * it, and once for the array that is actually passed).
 */
211 #define __extract_fields(dst, src, ...) \
212 __extract_fields_internal(dst, src, \
213 sizeof((int []) {__VA_ARGS__}) / sizeof(int) / 3, \
214 (int []) {__VA_ARGS__})
/*
 * __extract_fields_internal: moves fields from the src word array into the
 * dst word array as described by varg, read three ints at a time
 * (varg[i*3], varg[i*3+1], varg[i*3+2]).
 *
 * NOTE(review): this listing is missing lines (rest of the signature, the
 * loop over i, several conditions and closing braces). The comments below
 * describe only the statements that are visible.
 */
230 void __extract_fields_internal (uint64_t *restrict dst, uint64_t *restrict
src,
/* Triplet entries scaled by 8 -- presumably byte counts converted to bit
 * counts; TODO confirm against the caller contract. */
237 int dst_offset = varg[i*3] * 8;
238 int src_offset = varg[i*3+1] * 8;
239 int length = varg[i*3+2] * 8;
/* Destination does not start on a 64-bit word boundary: merge the leading
 * part into the existing dst word. srcmask selects the bits being written,
 * dstmask (its complement) the old bits that are kept.
 * NOTE(review): the statement guarded by the `segment_offset + length < 64`
 * test is not visible in this listing.
 * NOTE(review): `-1l` yields all-ones in uint64_t after conversion. */
247 if (dst_offset % 64) {
248 int segment_offset = dst_offset % 64;
249 uint64_t srcmask = -1l;
250 if (segment_offset +
length < 64)
252 srcmask <<= segment_offset;
253 uint64_t dstmask = ~srcmask;
254 uint64_t srcval = __extract_64bits(
src, src_offset, 64 - segment_offset) << segment_offset;
255 uint64_t oldval = dst[dst_offset/64];
256 dst[dst_offset/64] = (srcval & srcmask) | (oldval & dstmask);
/* Advance both offsets past the bits just handled and shrink the remainder. */
257 dst_offset += 64 - segment_offset;
258 src_offset += 64 - segment_offset;
259 length -= 64 - segment_offset;
/* Store one whole 64-bit word taken from src at src_offset. */
266 dst[dst_offset/64] = __extract_64bits(
src, src_offset, 64);
/* Tail merge: keep the dst bits at positions >= length and OR in srcval
 * (declared on a line not shown here).
 * NOTE(review): `-1l << length` left-shifts a negative signed value, which is
 * undefined behaviour in ISO C, and relies on long being 64 bits wide;
 * consider an unsigned 64-bit all-ones constant instead. */
276 uint64_t oldvalmasked = dst[dst_offset/64] & (-1l <<
length);
277 dst[dst_offset/64] = srcval | oldvalmasked;
/* Byte-swap handling: num_bytes is the negated third triplet entry, so this
 * part is presumably reached when that entry is negative -- TODO confirm.
 * The field is located inside the 64-bit dst word that contains byte
 * `offset` and shifted down so it starts at bit 0. */
281 int num_bytes = -varg[i*3+2];
282 int offset = varg[i*3];
283 uint64_t val = dst[offset/8];
284 uint64_t swapped = val >> (offset % 8)*8;
/* One of the following swaps is applied; the selecting conditions are not
 * visible here (presumably chosen by num_bytes). */
288 swapped = __builtin_bswap16(swapped);
291 swapped = __builtin_bswap32(swapped);
294 swapped = __builtin_bswap64(swapped);
/* Move the swapped field back to its position, then build a mask of the dst
 * bits to preserve: those below the field, plus those from byte index
 * upper_part upward.
 * NOTE(review): the condition guarding the upper-part mask is not shown; the
 * same signed-shift caveat as above applies to `-1l << ...`. */
297 swapped <<= (offset % 8)*8;
298 uint64_t mask = (1l << (offset % 8)*8) - 1;
299 int upper_part = offset % 8 + num_bytes;
301 mask |= -1l << upper_part * 8;
302 dst[offset/8] = swapped | (val & mask);
/* Stops compilation with a diagnostic about DPA_INTRIN_VERSION_USED.
 * Presumably the fallback arm of the version check near the top of the file;
 * the matching #else is not visible in this listing. */
309 #error Bad value for DPA_INTRIN_VERSION_USED
static uint64_t *restrict src
static __attribute__((always_inline)) uint64_t __extract_64bits(uint64_t arr[]
ARG varg integer array of triplets (dst offset, src offset, length in bytes)
static uint64_t *restrict int extract_count