// runtime/src/cuda_platform.h at commit e2906a3f67a56e0439eb6078f7a73d90a242e7d5 (AnyDSL/runtime, GitHub)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#ifndef CUDA_PLATFORM_H
#define CUDA_PLATFORM_H

#include "platform.h"
#include "runtime.h"

#include <atomic>
#include <forward_list>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>

#define CUDA_API_PER_THREAD_DEFAULT_STREAM
#include <cuda.h>
#include <nvvm.h>

#if CUDA_VERSION < 6050
    #error "CUDA 6.5 or higher required!"
#endif

#ifdef AnyDSL_runtime_CUDA_NVRTC
#include <nvrtc.h>
#endif

/// CUDA platform. Has the same number of devices as that of the CUDA implementation.
///
/// Implements the Platform interface on top of the CUDA driver API types
/// (CUdevice, CUcontext, CUmodule, CUfunction). Per-device state is kept in
/// `devices_`, one DeviceData entry per device.
class CudaPlatform : public Platform {
public:
    CudaPlatform(Runtime* runtime);
    ~CudaPlatform();

protected:
    // --- Memory management (Platform overrides) ---
    void* alloc(DeviceId dev, int64_t size) override;
    void* alloc_host(DeviceId dev, int64_t size) override;
    void* alloc_unified(DeviceId dev, int64_t size) override;
    void* get_device_ptr(DeviceId, void* ptr) override;
    void release(DeviceId dev, void* ptr) override;
    void release_host(DeviceId dev, void* ptr) override;

    // --- Kernel execution (Platform overrides) ---
    void launch_kernel(DeviceId dev, const LaunchParams& launch_params) override;
    void synchronize(DeviceId dev) override;

    // --- Data transfer (Platform overrides) ---
    // `hint_async` is only a hint; whether a copy is actually asynchronous is
    // decided by the implementation (not visible in this header).
    void copy(DeviceId dev_src, const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;
    void copy_from_host(const void* src, int64_t offset_src, DeviceId dev_dst, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;
    void copy_to_host(DeviceId dev_src, const void* src, int64_t offset_src, void* dst, int64_t offset_dst, int64_t size, bool hint_async) override;

    // --- Platform / device queries (Platform overrides) ---
    /// Number of devices managed by this platform (one per `devices_` entry).
    size_t dev_count() const override { return devices_.size(); }
    /// Platform identifier string.
    std::string name() const override { return "CUDA"; }
    const char* device_name(DeviceId dev) const override;
    bool device_check_feature_support(DeviceId dev, const char* feature) const override;

    /// Maps a kernel name to its CUfunction handle.
    typedef std::unordered_map<std::string, CUfunction> FunctionMap;

    /// Per-device state: driver handles, module/function caches and a spin lock.
    struct DeviceData {
        CUdevice dev;
        CUcontext ctx;
        CUjit_target compute_capability;
        /// Spin-lock flag driven by lock()/unlock().
        std::atomic_flag locked = ATOMIC_FLAG_INIT;
        /// Modules keyed by string (presumably the source file name — confirm in the .cpp).
        std::unordered_map<std::string, CUmodule> modules;
        /// Function lookup table for each module.
        std::unordered_map<CUmodule, FunctionMap> functions;
        /// Device name string.
        std::string name;

        // NOTE: dev, ctx and compute_capability are left uninitialized here;
        // the owner is expected to fill them in after construction.
        DeviceData() {}
        DeviceData(const DeviceData&) = delete;

        /// Move constructor: transfers handles, caches and the device name.
        /// The `locked` flag is not transferred; the new object starts
        /// unlocked via its default member initializer.
        DeviceData(DeviceData&& data)
            : dev(data.dev)
            , ctx(data.ctx)
            , compute_capability(data.compute_capability)
            , modules(std::move(data.modules))
            , functions(std::move(data.functions))
            // Must move from `data.name`: a bare `name` here would refer to the
            // member being initialized, i.e. a read of an uninitialized string.
            , name(std::move(data.name))
        {}

        /// Acquires the spin lock, busy-waiting until the flag is clear.
        void lock() {
            while (locked.test_and_set(std::memory_order_acquire)) ;
        }

        /// Releases the spin lock.
        void unlock() {
            locked.clear(std::memory_order_release);
        }
    };

    /// One entry per device; its size is what dev_count() reports.
    std::vector<DeviceData> devices_;

    /// Defaults to false. Presumably enables dumping of generated binaries — confirm in the .cpp.
    bool dump_binaries = false;

    /// A pair of events together with the context and platform they belong to.
    /// Presumably used to time work between `start` and `end` — confirm in the .cpp.
    struct ProfileData {
        CudaPlatform* platform;
        CUcontext ctx;
        CUevent start;
        CUevent end;
    };

    /// Mutex declared alongside `profiles_`; presumably guards it — confirm in the .cpp.
    std::mutex profile_lock_;
    std::forward_list<ProfileData*> profiles_;
    void erase_profiles(bool);

    /// Returns the CUfunction for `kernelname` from the program identified by `filename` on device `dev`.
    CUfunction load_kernel(DeviceId dev, const std::string& filename, const std::string& kernelname);

    // Compilation helpers. Each takes the program text in `program_string` and
    // returns a string (presumably PTX, given create_module's `ptx_string`
    // parameter — confirm in the .cpp).
    std::string compile_nvptx(DeviceId dev, const std::string& filename, const std::string& program_string) const;
    std::string compile_nvvm(DeviceId dev, const std::string& filename, const std::string& program_string) const;
    std::string compile_cuda(DeviceId dev, const std::string& filename, const std::string& program_string) const;
    /// Creates a CUmodule for device `dev` from the PTX text in `ptx_string`.
    CUmodule create_module(DeviceId dev, const std::string& filename, const std::string& ptx_string) const;
};

#endif