Program Listing for File CUDAStream.h¶
↰ Return to documentation for file (c10/cuda/CUDAStream.h
)
#pragma once
#include <cuda_runtime_api.h>
#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
/*
* Stream pool note.
*
* A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
* are backed by cuStreams, but they use several pools to minimize the costs
* associated with creating, retaining, and destroying cuStreams.
*
* There are three pools per device, and a device's pools are lazily created.
*
* The first pool contains only the default stream. When the default stream
* is requested it's returned.
*
* The second pool is the "low priority" or "default priority" streams. In
* HIP builds there is no distinction between streams in this pool and streams
* in the third pool (below). There are 32 of these streams per device, and
* when a stream is requested one of these streams is returned round-robin.
* That is, the first stream requested is at index 0, the second at index 1...
* to index 31, then index 0 again.
*
* This means that if 33 low priority streams are requested, the first and
* last streams requested are actually the same stream (under the covers)
* and kernels enqueued on them cannot run concurrently.
*
* The third pool is the "high priority" streams. The third pool acts like
* the second pool except the streams are created with a higher priority.
*
* These pools suggest that stream users should prefer many short-lived streams,
* as the cost of acquiring and releasing streams is effectively zero. If
* many longer-lived streams are required in performance critical scenarios
* then the functionality here may need to be extended to allow, for example,
* "reserving" a subset of the pool so that other streams do not accidentally
* overlap the performance critical streams.
*
* Note: although the notion of "current stream for device" is thread local
* (every OS thread has a separate current stream, as one might expect),
* the stream pool is global across all threads; stream 0 is always stream 0
* no matter which thread you use it on. Multiple threads can synchronize
* on the same stream. Although the CUDA documentation is not very clear
* on the matter, streams are thread safe; e.g., it is safe to enqueue
* a kernel on the same stream from two different threads.
*/
namespace c10::cuda {
static constexpr int max_compile_time_stream_priorities = 4;
// Value object representing a CUDA stream. This is just a wrapper
// around c10::Stream, but it comes with a little extra CUDA-specific
// functionality (conversion to cudaStream_t), and a guarantee that
// the wrapped c10::Stream really is a CUDA stream.
class C10_CUDA_API CUDAStream {
public:
enum Unchecked { UNCHECKED };
explicit CUDAStream(Stream stream) : stream_(stream) {
TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
}
explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
bool operator==(const CUDAStream& other) const noexcept {
return unwrap() == other.unwrap();
}
bool operator!=(const CUDAStream& other) const noexcept {
return unwrap() != other.unwrap();
}
operator cudaStream_t() const {
return stream();
}
operator Stream() const {
return unwrap();
}
DeviceType device_type() const {
return DeviceType::CUDA;
}
DeviceIndex device_index() const {
return stream_.device_index();
}
Device device() const {
return Device(DeviceType::CUDA, device_index());
}
StreamId id() const {
return stream_.id();
}
bool query() const {
DeviceGuard guard{stream_.device()};
cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
if (err == cudaSuccess) {
return true;
} else if (err != cudaErrorNotReady) {
C10_CUDA_CHECK(err);
} else {
// ignore and clear the error if not ready
(void)cudaGetLastError();
}
return false;
}
void synchronize() const {
DeviceGuard guard{stream_.device()};
c10::cuda::stream_synchronize(stream());
}
int priority() const {
DeviceGuard guard{stream_.device()};
int priority = 0;
C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
return priority;
}
cudaStream_t stream() const;
Stream unwrap() const {
return stream_;
}
struct c10::StreamData3 pack3() const {
return stream_.pack3();
}
// Unpack a CUDAStream from the 3 fields generated by pack().
static CUDAStream unpack3(
StreamId stream_id,
DeviceIndex device_index,
DeviceType device_type) {
return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
}
static std::tuple<int, int> priority_range() {
// Note: this returns the range of priority **supported by PyTorch**, not
// the range of priority **supported by CUDA**. The former is a subset of
// the latter.
int least_priority = 0, greatest_priority = 0;
C10_CUDA_CHECK(
cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
#ifdef USE_ROCM
// See Note [HIP stream priorities]
TORCH_INTERNAL_ASSERT(
least_priority == 1, "Unexpected HIP stream priority range");
least_priority = 0;
#else
TORCH_INTERNAL_ASSERT(
least_priority == 0, "Unexpected CUDA stream priority range");
#endif
TORCH_INTERNAL_ASSERT(
greatest_priority <= -1, "Unexpected CUDA stream priority range");
greatest_priority = std::max(
-c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
return std::make_tuple(least_priority, greatest_priority);
}
// Deleted for now; use CUDAEvent::block instead
// void synchronize_with(const CUDAEvent& event) const;
private:
Stream stream_;
};
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
// no default priority to disambiguate overloads
C10_API CUDAStream
getStreamFromPool(const int priority, DeviceIndex device = -1);
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);
C10_API void setCurrentCUDAStream(CUDAStream stream);
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
} // namespace c10::cuda
namespace std {
template <>
struct hash<c10::cuda::CUDAStream> {
size_t operator()(c10::cuda::CUDAStream s) const noexcept {
return std::hash<c10::Stream>{}(s.unwrap());
}
};
} // namespace std