PyTorch Custom C++ and CUDA Extensions
Big-brain mode
The idea: you decide stock PyTorch is too slow, so you implement the module yourself in C++, write a setup.py, and install it into your Python site-packages. From then on you just import it and call it like any other module.
Official tutorial: https://pytorch.apachecn.org/docs/1.0/cpp_extension.html
My hands-on, unofficial version:
C++ extensions come in two flavors: built "ahead of time" with setuptools, or built "just in time" via torch.utils.cpp_extension.load().
1 Building with setuptools
1.1 Write lltm.cpp
#include <torch/extension.h>
#include <vector>
#include <iostream>

torch::Tensor d_sigmoid(torch::Tensor z) {
  auto s = torch::sigmoid(z);
  return (1 - s) * s;
}

// tanh'(z) = 1 - tanh^2(z)
at::Tensor d_tanh(at::Tensor z) {
  return 1 - z.tanh().pow(2);
}

// elu'(z) = relu'(z) + { alpha * exp(z) if (alpha * (exp(z) - 1)) < 0, else 0}
at::Tensor d_elu(at::Tensor z, at::Scalar alpha = 1.0) {
  auto e = z.exp();
  auto mask = (alpha * (e - 1)) < 0;
  return (z > 0).type_as(z) + mask.type_as(z) * (alpha * e);
}

std::vector<at::Tensor> lltm_forward(
    at::Tensor input,
    at::Tensor weights,
    at::Tensor bias,
    at::Tensor old_h,
    at::Tensor old_cell) {
  auto X = at::cat({old_h, input}, /*dim=*/1);
  auto gate_weights = at::addmm(bias, X, weights.transpose(0, 1));
  auto gates = gate_weights.chunk(3, /*dim=*/1);

  auto input_gate = at::sigmoid(gates[0]);
  auto output_gate = at::sigmoid(gates[1]);
  auto candidate_cell = at::elu(gates[2], /*alpha=*/1.0);

  auto new_cell = old_cell + candidate_cell * input_gate;
  auto new_h = at::tanh(new_cell) * output_gate;

  return {new_h,
          new_cell,
          input_gate,
          output_gate,
          candidate_cell,
          X,
          gate_weights};
}

std::vector<at::Tensor> lltm_backward(
    at::Tensor grad_h,
    at::Tensor grad_cell,
    at::Tensor new_cell,
    at::Tensor input_gate,
    at::Tensor output_gate,
    at::Tensor candidate_cell,
    at::Tensor X,
    at::Tensor gate_weights,
    at::Tensor weights) {
  auto d_output_gate = at::tanh(new_cell) * grad_h;
  auto d_tanh_new_cell = output_gate * grad_h;
  auto d_new_cell = d_tanh(new_cell) * d_tanh_new_cell + grad_cell;

  auto d_old_cell = d_new_cell;
  auto d_candidate_cell = input_gate * d_new_cell;
  auto d_input_gate = candidate_cell * d_new_cell;

  auto gates = gate_weights.chunk(3, /*dim=*/1);
  d_input_gate *= d_sigmoid(gates[0]);
  d_output_gate *= d_sigmoid(gates[1]);
  d_candidate_cell *= d_elu(gates[2]);

  auto d_gates =
      at::cat({d_input_gate, d_output_gate, d_candidate_cell}, /*dim=*/1);

  auto d_weights = d_gates.t().mm(X);
  auto d_bias = d_gates.sum(/*dim=*/0, /*keepdim=*/true);

  auto d_X = d_gates.mm(weights);
  const auto state_size = grad_h.size(1);
  auto d_old_h = d_X.slice(/*dim=*/1, 0, state_size);
  auto d_input = d_X.slice(/*dim=*/1, state_size);

  return {d_old_h, d_input, d_weights, d_bias, d_old_cell};
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &lltm_forward, "LLTM forward");
  m.def("backward", &lltm_backward, "LLTM backward");
}
1.2 Write setup.py
from setuptools import setup, Extension
from torch.utils import cpp_extension

setup(name='lltm_cpp',
      ext_modules=[cpp_extension.CppExtension('lltm_cpp', ['lltm.cpp'])],
      cmdclass={'build_ext': cpp_extension.BuildExtension})
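Side note: CppExtension is for pure C++ sources. If you later add actual CUDA kernels (the "CUDA Extensions" half of the title), the same setup.py pattern works with CUDAExtension instead. A sketch only, where lltm_cuda.cpp and lltm_cuda_kernel.cu are hypothetical file names:

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(name='lltm_cuda',
      ext_modules=[CUDAExtension('lltm_cuda', ['lltm_cuda.cpp', 'lltm_cuda_kernel.cu'])],
      cmdclass={'build_ext': BuildExtension})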
1.3 Install
Your directory should look roughly like this:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# ls
lltm.cpp main2.py main.py setup.py
Install:
python setup.py install
Output:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python setup.py install
running install
running bdist_egg
running egg_info
creating lltm_cpp.egg-info
writing lltm_cpp.egg-info/PKG-INFO
writing dependency_links to lltm_cpp.egg-info/dependency_links.txt
writing top-level names to lltm_cpp.egg-info/top_level.txt
writing manifest file 'lltm_cpp.egg-info/SOURCES.txt'
reading manifest file 'lltm_cpp.egg-info/SOURCES.txt'
writing manifest file 'lltm_cpp.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_ext
building 'lltm_cpp' extension
creating build
creating build/temp.linux-x86_64-3.7
gcc -pthread -B /home/amzing/software/anaconda/enter/envs/pytracking/compiler_compat -Wl,--sysroot=/ -Wsign-compare -DNDEBUG -g -fwrapv -O3 -Wall -Wstrict-prototypes -fPIC -I/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/include -I/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/include/torch/csrc/api/include -I/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/include/TH -I/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/include/THC -I/home/amzing/software/anaconda/enter/envs/pytracking/include/python3.7m -c lltm.cpp -o build/temp.linux-x86_64-3.7/lltm.o -DTORCH_API_INCLUDE_EXTENSION_H -DTORCH_EXTENSION_NAME=lltm_cpp -D_GLIBCXX_USE_CXX11_ABI=0 -std=c++11
cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid for C/ObjC but not for C++
creating build/lib.linux-x86_64-3.7
g++ -pthread -shared -B /home/amzing/software/anaconda/enter/envs/pytracking/compiler_compat -L/home/amzing/software/anaconda/enter/envs/pytracking/lib -Wl,-rpath=/home/amzing/software/anaconda/enter/envs/pytracking/lib -Wl,--no-as-needed -Wl,--sysroot=/ build/temp.linux-x86_64-3.7/lltm.o -o build/lib.linux-x86_64-3.7/lltm_cpp.cpython-37m-x86_64-linux-gnu.so
creating build/bdist.linux-x86_64
creating build/bdist.linux-x86_64/egg
copying build/lib.linux-x86_64-3.7/lltm_cpp.cpython-37m-x86_64-linux-gnu.so -> build/bdist.linux-x86_64/egg
creating stub loader for lltm_cpp.cpython-37m-x86_64-linux-gnu.so
byte-compiling build/bdist.linux-x86_64/egg/lltm_cpp.py to lltm_cpp.cpython-37.pyc
creating build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm_cpp.egg-info/PKG-INFO -> build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm_cpp.egg-info/SOURCES.txt -> build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm_cpp.egg-info/dependency_links.txt -> build/bdist.linux-x86_64/egg/EGG-INFO
copying lltm_cpp.egg-info/top_level.txt -> build/bdist.linux-x86_64/egg/EGG-INFO
writing build/bdist.linux-x86_64/egg/EGG-INFO/native_libs.txt
zip_safe flag not set; analyzing archive contents...
__pycache__.lltm_cpp.cpython-37: module references __file__
creating dist
creating 'dist/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg' and adding 'build/bdist.linux-x86_64/egg' to it
removing 'build/bdist.linux-x86_64/egg' (and everything under it)
Processing lltm_cpp-0.0.0-py3.7-linux-x86_64.egg
creating /home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg
Extracting lltm_cpp-0.0.0-py3.7-linux-x86_64.egg to /home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages
Adding lltm-cpp 0.0.0 to easy-install.pth file
Installed /home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg
Processing dependencies for lltm-cpp==0.0.0
Finished processing dependencies for lltm-cpp==0.0.0
You can see it got installed to:
Installed /home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg
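A quick way to confirm the install actually worked (note that you should import torch before importing the extension, since the extension's shared library relies on symbols that get loaded when torch is imported):

python -c "import torch; import lltm_cpp; print(lltm_cpp.forward)"

If that prints the bound function instead of raising ImportError, you're good.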
1.4 Test
Benchmark how fast our lltm module is:
main.py (CPU):
import torch
import math
import time

# Our module!
import lltm_cpp as lltm


class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        variables = outputs[1:] + [weights]
        ctx.save_for_backward(*variables)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        outputs = lltm.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs
        return d_input, d_weights, d_bias, d_old_h, d_old_cell


class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        return LLTMFunction.apply(input, self.weights, self.bias, *state)


if __name__ == '__main__':
    batch_size = 16
    input_features = 32
    state_size = 128

    X = torch.randn(batch_size, input_features)
    h = torch.randn(batch_size, state_size)
    C = torch.randn(batch_size, state_size)

    rnn = LLTM(input_features, state_size)

    forward = 0
    backward = 0
    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5))
Result:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main.py
main.py:23: DeprecationWarning: 'saved_variables' is deprecated; use 'saved_tensors'
grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
Forward: 116.962 us | Backward 332.088 us
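Before trusting these numbers, it is also worth checking that the hand-written C++ backward really matches the forward. A minimal sketch with torch.autograd.gradcheck (my addition, not part of the original scripts; gradcheck wants double precision, and the small shapes here are arbitrary):

import torch
from torch.autograd import gradcheck

from main import LLTMFunction  # the autograd wrapper defined above (same directory)

# gradcheck compares the analytical backward against numerical finite differences
kwargs = {'dtype': torch.float64, 'requires_grad': True}
X = torch.randn(3, 17, **kwargs)
h = torch.randn(3, 5, **kwargs)
C = torch.randn(3, 5, **kwargs)
W = torch.randn(3 * 5, 17 + 5, **kwargs)
b = torch.randn(1, 3 * 5, **kwargs)

if gradcheck(LLTMFunction.apply, (X, W, b, h, C), eps=1e-6, atol=1e-4):
    print('gradcheck passed')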
main_gpu.py:
import torch
import math
import time

# Our module!
import lltm_cpp as lltm

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU


class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        variables = outputs[1:] + [weights]
        ctx.save_for_backward(*variables)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        outputs = lltm.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs
        return d_input, d_weights, d_bias, d_old_h, d_old_cell


class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        return LLTMFunction.apply(input, self.weights, self.bias, *state)


if __name__ == '__main__':
    batch_size = 16
    input_features = 32
    state_size = 128

    # Note the device=cuda_device arguments here
    X = torch.randn(batch_size, input_features, device=cuda_device)
    h = torch.randn(batch_size, state_size, device=cuda_device)
    C = torch.randn(batch_size, state_size, device=cuda_device)

    rnn = LLTM(input_features, state_size).to(cuda_device)

    forward = 0
    backward = 0
    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5))
Result:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main_gpu.py
main_gpu.py:26: DeprecationWarning: 'saved_variables' is deprecated; use 'saved_tensors'
grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
Forward: 171.483 us | Backward 592.045 us
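A caveat about these GPU numbers: CUDA kernels launch asynchronously, so timing them with time.time() alone mostly measures launch and Python overhead rather than actual execution. For a more honest GPU measurement, synchronize before reading the clock. A drop-in tweak to the timing loop in main_gpu.py (my addition, not in the original script):

    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        torch.cuda.synchronize()  # wait for the forward kernels to actually finish
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        torch.cuda.synchronize()  # wait for the backward kernels to actually finish
        backward += time.time() - start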
Now benchmark without the lltm module, using plain native PyTorch code:
main2.py (CPU):
import torch
import math
import torch.nn.functional as F
import time


class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        # 3 * state_size for input gate, output gate and candidate cell gate.
        # input_features + state_size because we will multiply with [input, h].
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        old_h, old_cell = state
        X = torch.cat([old_h, input], dim=1)
        # Compute the input, output and candidate cell gates with one MM.
        gate_weights = F.linear(X, self.weights, self.bias)
        # Split the combined gate weight matrix into its components.
        gates = gate_weights.chunk(3, dim=1)
        input_gate = F.sigmoid(gates[0])
        output_gate = F.sigmoid(gates[1])
        # Here we use an ELU instead of the usual tanh.
        candidate_cell = F.elu(gates[2])
        # Compute the new cell state.
        new_cell = old_cell + candidate_cell * input_gate
        # Compute the new hidden state and output.
        new_h = F.tanh(new_cell) * output_gate
        return new_h, new_cell


if __name__ == '__main__':
    batch_size = 16
    input_features = 32
    state_size = 128

    X = torch.randn(batch_size, input_features)
    h = torch.randn(batch_size, state_size)
    C = torch.randn(batch_size, state_size)

    rnn = LLTM(input_features, state_size)

    forward = 0
    backward = 0
    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5))
Result:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main2.py
/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/nn/functional.py:1351: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/nn/functional.py:1340: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.
warnings.warn("nn.functional.tanh is deprecated. Use torch.tanh instead.")
Forward: 165.565 us | Backward 241.552 us
main2_gpu.py:
import torch
import math
import torch.nn.functional as F
import time

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU


class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        # 3 * state_size for input gate, output gate and candidate cell gate.
        # input_features + state_size because we will multiply with [input, h].
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        old_h, old_cell = state
        X = torch.cat([old_h, input], dim=1)
        # Compute the input, output and candidate cell gates with one MM.
        gate_weights = F.linear(X, self.weights, self.bias)
        # Split the combined gate weight matrix into its components.
        gates = gate_weights.chunk(3, dim=1)
        input_gate = F.sigmoid(gates[0])
        output_gate = F.sigmoid(gates[1])
        # Here we use an ELU instead of the usual tanh.
        candidate_cell = F.elu(gates[2])
        # Compute the new cell state.
        new_cell = old_cell + candidate_cell * input_gate
        # Compute the new hidden state and output.
        new_h = F.tanh(new_cell) * output_gate
        return new_h, new_cell


if __name__ == '__main__':
    batch_size = 16
    input_features = 32
    state_size = 128

    # Note the device=cuda_device arguments here
    X = torch.randn(batch_size, input_features, device=cuda_device)
    h = torch.randn(batch_size, state_size, device=cuda_device)
    C = torch.randn(batch_size, state_size, device=cuda_device)

    rnn = LLTM(input_features, state_size).to(cuda_device)

    forward = 0
    backward = 0
    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5))
Result:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main2_gpu.py
/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/nn/functional.py:1351: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.
warnings.warn("nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.")
/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/nn/functional.py:1340: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.
warnings.warn("nn.functional.tanh is deprecated. Use torch.tanh instead.")
Forward: 207.877 us | Backward 306.846 us
1.5 Comparison
CPU, with the extension: Forward: 116.962 us | Backward 332.088 us
CPU, plain PyTorch:      Forward: 165.565 us | Backward 241.552 us
GPU, with the extension: Forward: 171.483 us | Backward 592.045 us
GPU, plain PyTorch:      Forward: 207.877 us | Backward 306.846 us
So the C++ forward is faster, but the backward is actually slower than plain PyTorch, presumably because autograd's built-in backward already runs well-optimized kernels, so hand-writing it isn't automatically a win. Aren't we a genius?!
2 Building with JIT
JIT = just in time.
You call load() directly and it drives ninja to compile the C++ for you. The upside is that you don't have to write a setup.py and install anything yourself; the downside, supposedly, is that it can be a bit slower.
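In other words, the whole build step collapses into one call. A minimal sketch of just the loading part (extra_cflags is optional and only shown to illustrate that load() accepts extra compiler flags):

from torch.utils.cpp_extension import load

# The first call emits a ninja build file in a cache directory (in this run,
# /tmp/torch_extensions/lltm, as the logs below show), compiles lltm.cpp into a
# shared library, and imports it; later calls with unchanged sources reuse the cached .so.
lltm = load(name="lltm", sources=["lltm.cpp"], extra_cflags=["-O3"], verbose=True)

print(lltm.forward)  # the functions bound via PYBIND11_MODULE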
2.1 Test
First uninstall the lltm extension we installed earlier:
pip uninstall lltm_cpp
Output:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# pip uninstall lltm_cpp
Found existing installation: lltm-cpp 0.0.0
Uninstalling lltm-cpp-0.0.0:
Would remove:
/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/lltm_cpp-0.0.0-py3.7-linux-x86_64.egg
Proceed (y/n)? y
Successfully uninstalled lltm-cpp-0.0.0
WARNING: Running pip as root will break packages and permissions. You should install packages reliably by using venv: https://pip.pypa.io/warnings/venv
After cleaning up the old build files, your directory looks roughly like this:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# ls
lltm.cpp main2_gpu.py main2.py main_gpu.py main.py setup.py
Write main3_cpu.py:
import math
import torch
import time

# Our module!
from torch.utils.cpp_extension import load

lltm = load(name="lltm", sources=["lltm.cpp"], verbose=True)


class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        variables = outputs[1:] + [weights]
        ctx.save_for_backward(*variables)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        outputs = lltm.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs
        return d_input, d_weights, d_bias, d_old_h, d_old_cell


class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        return LLTMFunction.apply(input, self.weights, self.bias, *state)


if __name__ == '__main__':
    batch_size = 16
    input_features = 32
    state_size = 128

    X = torch.randn(batch_size, input_features)
    h = torch.randn(batch_size, state_size)
    C = torch.randn(batch_size, state_size)

    rnn = LLTM(input_features, state_size)

    forward = 0
    backward = 0
    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5))
Result:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main3_cpu.py
Using /tmp/torch_extensions as PyTorch extensions root...
Emitting ninja build file /tmp/torch_extensions/lltm/build.ninja...
Building extension module lltm...
1.10.2
Loading extension module lltm...
Traceback (most recent call last):
File "main3_cpu.py", line 8, in <module>
lltm = load(name="lltm", sources=["lltm.cpp"], verbose=True)
File "/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 680, in load
is_python_module)
File "/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 877, in _jit_compile
return _import_module_from_library(name, build_directory, is_python_module)
File "/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/site-packages/torch/utils/cpp_extension.py", line 1084, in _import_module_from_library
file, path, description = imp.find_module(module_name, [path])
File "/home/amzing/software/anaconda/enter/envs/pytracking/lib/python3.7/imp.py", line 296, in find_module
raise ImportError(_ERR_MSG.format(name), name=name)
ImportError: No module named 'lltm'
This happens because ninja only generated /tmp/torch_extensions/lltm/build.ninja but never actually ran the build. Yeah, pretty silly. The workaround: go into /tmp/torch_extensions/lltm/ and compile it yourself, and just leave the generated files there; when you rerun the script it picks them up (no idea why it finds them then, but it does). If that doesn't work for you, copy the resulting .so over next to your script.
cd /tmp/torch_extensions/lltm
ninja -f build.ninja
Output:
(pytracking) root@amzing-ubuntu:/tmp/torch_extensions/lltm# ls
build.ninja
(pytracking) root@amzing-ubuntu:/tmp/torch_extensions/lltm# ninja -f build.ninja
[2/2] c++ lltm.o -shared -o lltm.so
(pytracking) root@amzing-ubuntu:/tmp/torch_extensions/lltm# ls
build.ninja lltm.o lltm.so
Run it again:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main3_cpu.py
Using /tmp/torch_extensions as PyTorch extensions root...
Emitting ninja build file /tmp/torch_extensions/lltm/build.ninja...
Building extension module lltm...
1.10.2
Loading extension module lltm...
main3_cpu.py:23: DeprecationWarning: 'saved_variables' is deprecated; use 'saved_tensors'
grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
Forward: 146.736 us | Backward 380.893 us
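If you'd rather not depend on files sitting in /tmp, load() also accepts an explicit build_directory, so the build artifacts (build.ninja, lltm.o, lltm.so) land somewhere you control. A sketch only; the ./lltm_build folder name is my own choice:

import os
from torch.utils.cpp_extension import load

build_dir = os.path.abspath("./lltm_build")
os.makedirs(build_dir, exist_ok=True)

# the build (or the manual ninja step shown above) then happens inside ./lltm_build
lltm = load(name="lltm", sources=["lltm.cpp"], build_directory=build_dir, verbose=True)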
main3_gpu.py:
import math
import torch
import time

assert torch.cuda.is_available()
cuda_device = torch.device("cuda")  # device object representing GPU

# Our module!
from torch.utils.cpp_extension import load

lltm = load(name="lltm", sources=["lltm.cpp"], verbose=True)


class LLTMFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weights, bias, old_h, old_cell):
        outputs = lltm.forward(input, weights, bias, old_h, old_cell)
        new_h, new_cell = outputs[:2]
        variables = outputs[1:] + [weights]
        ctx.save_for_backward(*variables)
        return new_h, new_cell

    @staticmethod
    def backward(ctx, grad_h, grad_cell):
        outputs = lltm.backward(
            grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
        d_old_h, d_input, d_weights, d_bias, d_old_cell = outputs
        return d_input, d_weights, d_bias, d_old_h, d_old_cell


class LLTM(torch.nn.Module):
    def __init__(self, input_features, state_size):
        super(LLTM, self).__init__()
        self.input_features = input_features
        self.state_size = state_size
        self.weights = torch.nn.Parameter(
            torch.empty(3 * state_size, input_features + state_size))
        self.bias = torch.nn.Parameter(torch.empty(3 * state_size))
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1.0 / math.sqrt(self.state_size)
        for weight in self.parameters():
            weight.data.uniform_(-stdv, +stdv)

    def forward(self, input, state):
        return LLTMFunction.apply(input, self.weights, self.bias, *state)


if __name__ == '__main__':
    batch_size = 16
    input_features = 32
    state_size = 128

    # Note the device=cuda_device arguments here
    X = torch.randn(batch_size, input_features, device=cuda_device)
    h = torch.randn(batch_size, state_size, device=cuda_device)
    C = torch.randn(batch_size, state_size, device=cuda_device)

    rnn = LLTM(input_features, state_size).to(cuda_device)

    forward = 0
    backward = 0
    for _ in range(100000):
        start = time.time()
        new_h, new_C = rnn(X, (h, C))
        forward += time.time() - start

        start = time.time()
        (new_h.sum() + new_C.sum()).backward()
        backward += time.time() - start

    print('Forward: {:.3f} us | Backward {:.3f} us'.format(forward * 1e6/1e5, backward * 1e6/1e5))
Result:
(pytracking) root@amzing-ubuntu:/home/amzing/python/trycuda# python main3_gpu.py
Using /tmp/torch_extensions as PyTorch extensions root...
Emitting ninja build file /tmp/torch_extensions/lltm/build.ninja...
Building extension module lltm...
1.10.2
Loading extension module lltm...
main3_gpu.py:26: DeprecationWarning: 'saved_variables' is deprecated; use 'saved_tensors'
grad_h.contiguous(), grad_cell.contiguous(), *ctx.saved_variables)
Forward: 200.011 us | Backward 655.412 us
2.2 Comparison
JIT, CPU: Forward: 146.736 us | Backward 380.893 us
JIT, GPU: Forward: 200.011 us | Backward 655.412 us
Overall comparison:
setup, CPU, with the extension: Forward: 116.962 us | Backward 332.088 us
setup, CPU, plain PyTorch:      Forward: 165.565 us | Backward 241.552 us
setup, GPU, with the extension: Forward: 171.483 us | Backward 592.045 us
setup, GPU, plain PyTorch:      Forward: 207.877 us | Backward 306.846 us
JIT, CPU:                       Forward: 146.736 us | Backward 380.893 us
JIT, GPU:                       Forward: 200.011 us | Backward 655.412 us
So the JIT build actually comes out a bit slower than the setuptools build here, which matches the "JIT is a bit slower" claim from earlier. At runtime, then, there's no free lunch either way.
What a genius, truly.
Notes
I remember that back then I also had to patch torch.utils.cpp_extension, somewhere past line 1000, in the code that checks the ninja version: I changed ninja -v to ninja --version.
This run used a conda environment and I don't remember whether I re-applied that patch. Anyway, if you ever hit that issue, at least now you know where to look.
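If you suspect the same ninja issue, you can ask PyTorch directly before patching anything (verify_ninja_availability() lives in torch.utils.cpp_extension and raises an error if ninja can't be run; exact behavior may differ between versions):

python -c "from torch.utils.cpp_extension import verify_ninja_availability; verify_ninja_availability(); print('ninja OK')"

And a plain ninja --version on the command line tells you whether ninja itself is installed at all.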