/*
 * NVIDIA DOCA SDK — Data Center on a Chip Framework
 * dpaintrin.h: header file for all DPA intrinsics.
 */
1 /*
2  * NVIDIA_COPYRIGHT_BEGIN
3  *
4  * Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
5  *
6  * NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
7  * property and proprietary rights in and to this material, related
8  * documentation and any modifications thereto. Any use, reproduction,
9  * disclosure or distribution of this material and related documentation
10  * without an express license agreement from NVIDIA CORPORATION or
11  * its affiliates is strictly prohibited.
12  *
13  * NVIDIA_COPYRIGHT_END
14  */
15 
//===--------- dpaintrin.h - Header file for all DPA intrinsics -----------===
#ifndef __DPAINTRIN_H
#define __DPAINTRIN_H

/*
 * Users need to define following macro before including this header file to
 * use a specific version of DPA intrinsics
 */
#ifndef DPA_INTRIN_VERSION_USED
/* Default to intrinsics version 1.3, the only version this header provides. */
#define DPA_INTRIN_VERSION_USED (DPA_INTRIN_VERSION(1, 3))
#endif

#if (DPA_INTRIN_VERSION_USED == (DPA_INTRIN_VERSION(1, 3)))

/* Memory-space selectors passed to the fence intrinsics; thin aliases of the
 * compiler-internal __MSPACE_* identifiers. */
#define __DPA_HEAP __MSPACE_HEAP
#define __DPA_MEMORY __MSPACE_MEMORY
#define __DPA_MMIO __MSPACE_MMIO
#define __DPA_SYSTEM __MSPACE_SYSTEM

/* Memory-operation selectors: order reads (__DPA_R), writes (__DPA_W), or
 * both (__DPA_RW) around a fence. Aliases of the internal __MOP_* names. */
#define __DPA_R __MOP_R
#define __DPA_W __MOP_W
#define __DPA_RW __MOP_RW
/*
 * Generic per-thread fence: order memory operations of kind PRED_OP issued
 * before the fence against operations of kind SUCC_OP issued after it, within
 * MEMORY_SPACE (__DPA_MEMORY / __DPA_MMIO / __DPA_SYSTEM / __DPA_HEAP).
 * Expands to the version-1.3 internal intrinsic.
 *
 * Fix: removed the trailing semicolon from the expansion (CERT PRE11-C).
 * With it, every use site ending in ';' expanded to two statements, which
 * breaks brace-less if/else bodies such as
 *     if (cond) __dpa_thread_memory_fence(__DPA_W, __DPA_W); else ...
 * Callers already supply the terminating ';' themselves.
 */
#define __dpa_thread_fence(MEMORY_SPACE, PRED_OP, SUCC_OP) \
  __dpa_thread_fence_internal_1_3(MEMORY_SPACE, PRED_OP, SUCC_OP)
56 
/* Fence ordering OP1 (before) against OP2 (after) for regular memory. */
#define __dpa_thread_memory_fence(OP1, OP2) \
 __dpa_thread_fence(__DPA_MEMORY, OP1, OP2)

/* Fence ordering OP1 against OP2 for outbox accesses (MMIO space). */
#define __dpa_thread_outbox_fence(OP1, OP2) \
 __dpa_thread_fence(__DPA_MMIO, OP1, OP2)

/* Fence ordering OP1 against OP2 for window accesses (also MMIO space;
 * identical expansion to the outbox fence). */
#define __dpa_thread_window_fence(OP1, OP2) \
 __dpa_thread_fence(__DPA_MMIO, OP1, OP2)

/* Full read+write fence over the system memory space. */
#define __dpa_thread_system_fence() \
 __dpa_thread_fence(__DPA_SYSTEM, __DPA_RW, __DPA_RW)

/* Read-read fence over the MMIO space; named for window read-invalidate
 * semantics in the DPA docs — NOTE(review): expansion is a plain fence,
 * confirm invalidation is implied by the intrinsic. */
#define __dpa_thread_window_read_inv() \
 __dpa_thread_fence(__DPA_MMIO, __DPA_R, __DPA_R)

/* Write-write fence over the MMIO space; flushes prior window writes
 * before subsequent ones. */
#define __dpa_thread_window_writeback() \
 __dpa_thread_fence(__DPA_MMIO, __DPA_W, __DPA_W)
/* L1 cache flush intrinsic; only available on CX8 and newer DPA targets. */
#if __NV_DPA >= NV_DPA_CX8
#define __dpa_thread_l1_flush() \
 __dpa_thread_l1_flush_internal_1_3()
#endif

/* NOTE(review): this check spells the target macro __NV_DPA_BF3 (leading
 * underscores) while the CX8 checks use NV_DPA_CX8 — confirm both spellings
 * are intended. */
#if __NV_DPA == __NV_DPA_BF3
/* On BF3, memory writeback needs a real write-write fence. */
#define __dpa_thread_memory_writeback() \
 __dpa_thread_fence(__DPA_MEMORY, __DPA_W, __DPA_W)
#else
/* On other targets a compiler barrier suffices — no hardware fence emitted. */
#define __dpa_thread_memory_writeback() __dpa_compiler_barrier()
#endif
/* Fixed-point math intrinsics, gated on the RISC-V RPFXP ISA extension:
 * reciprocal, 2^N, and log2 of N (fixed-point formats per the DPA docs). */
#if defined(__riscv_xrpfxp)
#define __dpa_fxp_rcp(N) __dpa_fxp_rcp_internal_1_3(N)
#define __dpa_fxp_pow2(N) __dpa_fxp_pow2_internal_1_3(N)
#define __dpa_fxp_log2(N) __dpa_fxp_log2_internal_1_3(N)
#endif // __riscv_xrpfxp

/* Gated on the NVCC cache-control extension: hint that the data at ADDR can
 * be discarded — presumably a cache-drop hint; confirm against the DPA ISA. */
#if defined(__riscv_xnvcc)
#define __dpa_data_ignore(ADDR) __dpa_data_ignore_internal_1_3(ADDR)
#endif // __riscv_xnvcc

/* Per-thread hardware counters: cycle count, retired-instruction count,
 * and current time. */
#define __dpa_thread_cycles() __dpa_thread_cycles_internal_1_3()
#define __dpa_thread_inst_ret() __dpa_thread_inst_ret_internal_1_3()
#define __dpa_thread_time() __dpa_thread_time_internal_1_3()
/* Remote atomic operations — available on CX8 and newer DPA targets.
 * PTR addresses remote memory; MEMORDER selects the memory-ordering
 * constraint forwarded to the internal intrinsic. */
#if __NV_DPA >= NV_DPA_CX8
/* Atomically load the value at PTR. */
#define __dpa_remote_atomic_load(PTR, MEMORDER) \
 __dpa_remote_atomic_load_internal_1_3(PTR, MEMORDER)

/* Atomically store VAL at PTR and return the previous value. */
#define __dpa_remote_atomic_exchange(PTR, VAL, MEMORDER) \
 __dpa_remote_atomic_exchange_internal_1_3(PTR, VAL, MEMORDER)

/* Atomic add/sub returning the NEW value (op-then-fetch). */
#define __dpa_remote_atomic_add_fetch(PTR, VAL, MEMORDER) \
 __dpa_remote_atomic_add_fetch_internal_1_3(PTR, VAL, MEMORDER)
#define __dpa_remote_atomic_sub_fetch(PTR, VAL, MEMORDER) \
 __dpa_remote_atomic_sub_fetch_internal_1_3(PTR, VAL, MEMORDER)

/* Atomic add/sub returning the OLD value (fetch-then-op). */
#define __dpa_remote_atomic_fetch_add(PTR, VAL, MEMORDER) \
 __dpa_remote_atomic_fetch_add_internal_1_3(PTR, VAL, MEMORDER)
#define __dpa_remote_atomic_fetch_sub(PTR, VAL, MEMORDER) \
 __dpa_remote_atomic_fetch_sub_internal_1_3(PTR, VAL, MEMORDER)

/* Compare-and-exchange; additionally requires the CAS ISA extension.
 * Separate memory orders for the success and failure paths. */
#if defined(__riscv_xcas)
#define __dpa_remote_atomic_compare_exchange(PTR, EXPECTED, DESIRED, \
 SUCC_MEMORDER, FAIL_MEMORDER) \
 __dpa_remote_atomic_compare_exchange_internal_1_3(PTR, EXPECTED, DESIRED, \
 SUCC_MEMORDER, FAIL_MEMORDER)
#endif

#endif
#include <stdint.h>

/*
 * Copy a set of bit fields from src to dst. The variadic arguments are
 * triplets (dst byte offset, src byte offset, length in bytes); the triplet
 * count is derived at compile time from the size of the compound literal
 * (sizeof array / sizeof(int) / 3), and the same literal is passed through
 * as the triplet array.
 */
#define __extract_fields(dst, src, ...) \
 __extract_fields_internal(dst, src, \
 sizeof((int []) {__VA_ARGS__}) / sizeof(int) / 3, \
 (int []) {__VA_ARGS__})
215 
/*
 * Read a span of up to 64 bits from the packed word array 'arr', starting at
 * absolute bit position 'bitoffset'. When the span crosses a 64-bit word
 * boundary, the low bits of the following word are spliced in above the
 * bits taken from the first word. Bits above 'bitlength' in the returned
 * value are NOT masked off — callers mask the result themselves.
 */
static inline __attribute__((always_inline))
uint64_t __extract_64bits(uint64_t arr[], int bitoffset, int bitlength)
{
 int word = bitoffset / 64;
 int shift = bitoffset % 64;
 uint64_t out = arr[word] >> shift;
 // Span straddles two words: pull the remainder from the next word.
 if (shift + bitlength > 64)
 out |= arr[word + 1] << (64 - shift);
 return out;
}
227 
/*
 * Worker behind __extract_fields(): copy 'extract_count' bit fields from
 * 'src' into 'dst'. 'varg' holds triplets (dst offset, src offset, length),
 * all in BYTES; they are converted to bits on entry. A zero length skips the
 * triplet; a negative length copies |length| bytes and then byte-swaps the
 * copied field in place (endianness reversal). Each field is copied in three
 * phases: an unaligned head that merges into a partially-kept dst word, whole
 * 64-bit words, and a masked tail.
 */
static inline __attribute__((always_inline))
void __extract_fields_internal (uint64_t *restrict dst, uint64_t *restrict src,
 int extract_count, int varg[])
{
 #pragma unroll
 for (int i = 0; i < extract_count; i++) {
 int reverse = 0;
 // triplet in bits
 int dst_offset = varg[i*3] * 8;
 int src_offset = varg[i*3+1] * 8;
 int length = varg[i*3+2] * 8;
 if (length < 0) {
 // negative length requests a byte-swapped copy of |length| bits
 length = -length;
 reverse = 1;
 } else if (length == 0)
 continue;

 // initial suffix to copy to dst if it starts in the middle of an 8-byte boundary
 if (dst_offset % 64) {
 int segment_offset = dst_offset % 64;
 uint64_t srcmask = -1l;
 if (segment_offset + length < 64) // whole extraction is within the 8 byte
 srcmask >>= 64 - length;
 srcmask <<= segment_offset;
 uint64_t dstmask = ~srcmask;
 // merge extracted bits into dst, preserving bits outside the field
 uint64_t srcval = __extract_64bits(src, src_offset, 64 - segment_offset) << segment_offset;
 uint64_t oldval = dst[dst_offset/64];
 dst[dst_offset/64] = (srcval & srcmask) | (oldval & dstmask);
 dst_offset += 64 - segment_offset;
 src_offset += 64 - segment_offset;
 length -= 64 - segment_offset;
 // short field: the head consumed more than 'length'; clamp to zero
 if (length < 0)
 length = 0;
 }

 // copy whole 8-byte segments
 while (length / 64) {
 dst[dst_offset/64] = __extract_64bits(src, src_offset, 64);
 dst_offset += 64;
 src_offset += 64;
 length -= 64;
 }

 // remaining prefix
 if (length) {
 // mask the extracted value to 'length' bits and keep dst's upper bits
 uint64_t srcval =
 __extract_64bits(src, src_offset, length) & ((1l<<length)-1);
 uint64_t oldvalmasked = dst[dst_offset/64] & (-1l << length);
 dst[dst_offset/64] = srcval | oldvalmasked;
 }

 if (reverse) {
 // byte-swap the field just written, in place within its dst word;
 // 'offset' here is the field's BYTE offset, so offset/8 indexes words
 int num_bytes = -varg[i*3+2];
 int offset = varg[i*3];
 uint64_t val = dst[offset/8];
 // shift the field down to bit 0 before swapping
 uint64_t swapped = val >> (offset % 8)*8;
 switch (num_bytes)
 {
 case 2:
 swapped = __builtin_bswap16(swapped);
 break;
 case 4:
 swapped = __builtin_bswap32(swapped);
 break;
 case 8:
 swapped = __builtin_bswap64(swapped);
 break;
 }
 // NOTE: only 2/4/8-byte fields are swapped; other sizes fall through
 // unchanged (no default case).
 swapped <<= (offset % 8)*8;
 // build a mask of the dst-word bytes OUTSIDE the swapped field
 uint64_t mask = (1l << (offset % 8)*8) - 1;
 int upper_part = offset % 8 + num_bytes;
 if (upper_part < 8)
 mask |= -1l << upper_part * 8;
 dst[offset/8] = swapped | (val & mask);
 }
 }
}
306 
#else

/* No other intrinsics version is provided by this header: fail loudly at
 * preprocess time rather than silently compiling without intrinsics. */
#error Bad value for DPA_INTRIN_VERSION_USED

#endif // DPA_INTRIN_VERSION_USED

#endif // __DPAINTRIN_H
/*
 * Doxygen cross-reference residue from the documentation export (not part of
 * the compilable header), preserved for reference:
 *   __extract_64bits(uint64_t arr[], int bitoffset, int bitlength)
 *       — defined above at the "static inline" helper.
 *   __extract_fields_internal(uint64_t *restrict dst, uint64_t *restrict src,
 *                             int extract_count, int varg[])
 *       — varg is an integer array of triplets
 *         (dst offset, src offset, length in bytes).
 */