#
# UCP basic device cuda tests
#
# Each entry is one line: a test label followed by the options for that test.
# The three entries below pass no -T option. They cover ucp_put_lat at -s 8
# and -s 1024, and ucp_put_bw at -s 1024, all with -m cuda -n 10000.
# NOTE(review): presumably -s is the message size in bytes and -n the
# iteration count, as in ucx_perftest -- confirm.
ucp_device_cuda_lat_8b_1thread               -t ucp_put_lat -m cuda -s 8 -n 10000
ucp_device_cuda_bw_1k_1thread                -t ucp_put_bw -m cuda -s 1024 -n 10000
ucp_device_cuda_lat_1k_1thread               -t ucp_put_lat -m cuda -s 1024 -n 10000

# Same 1K bandwidth/latency options as the entries without -T, run with -T 32.
# TODO: increase the number of threads (-T) once the following are fixed:
# - Use thread-local memory instead of shared for requests (limit 48K)
# - Fix WQE size limit of 1024
# NOTE(review): read as pending work, i.e. 32 is the current ceiling -- confirm.
ucp_device_cuda_bw_1k_32threads              -t ucp_put_bw -m cuda -s 1024 -n 10000 -T 32
ucp_device_cuda_lat_1k_32threads             -t ucp_put_lat -m cuda -s 1024 -n 10000 -T 32

# Same options as the ucp_device_cuda_*_1k_32threads entries (-s 1024,
# -n 10000, -T 32), with -L warp added.
# NOTE(review): presumably -L warp makes the 32 threads cooperate as a single
# warp (hence "1warp" in the label) -- confirm against ucx_perftest usage.
ucp_device_cuda_bw_1k_1warp                -t ucp_put_bw -m cuda -s 1024 -n 10000 -T 32 -L warp
ucp_device_cuda_lat_1k_1warp               -t ucp_put_lat -m cuda -s 1024 -n 10000 -T 32 -L warp
