显存释放过程

import torch
import subprocess
import gc

def get_gpu_memory_map():
    """Return the GPU memory currently in use, in MB, via ``nvidia-smi``.

    ``nvidia-smi --query-gpu=memory.used`` prints one line per GPU.  The
    previous version passed the raw bytes straight to ``float()``, which only
    works on single-GPU machines (and crashes with ``ValueError`` as soon as
    there is more than one line of output).  Here the output is decoded and
    parsed line by line and the per-GPU readings are summed; on a single-GPU
    machine the result is unchanged.

    Returns:
        float: total used GPU memory across all visible GPUs, in MB.
    """
    result = subprocess.check_output([
        'nvidia-smi', '--query-gpu=memory.used',
        '--format=csv,nounits,noheader'
    ])

    # One numeric token per GPU, e.g. b"136\n" or b"136\n42\n".
    return sum(float(line) for line in result.decode().split())


# Demonstrate when GPU memory is actually handed back to the driver:
# move a tensor to the GPU, delete the reference, then empty PyTorch's
# caching allocator and compare nvidia-smi readings at each stage.
baseline_mb = get_gpu_memory_map()
print(f"Before Op: {baseline_mb} MB")

x_ = torch.zeros(1024, 1024)

device = torch.device('cuda:0')
x = x_.to(device)  # the first CUDA op also allocates the CUDA context

after_copy_mb = get_gpu_memory_map()
delta_copy = after_copy_mb - baseline_mb
print(f"Increased to {delta_copy} MB.")

# Drop the only GPU reference, force a GC pass, and release cached blocks.
del x
gc.collect()
torch.cuda.empty_cache()

after_clean_mb = get_gpu_memory_map()
delta_clean = after_clean_mb - baseline_mb
released_mb = delta_copy - delta_clean
print(f"Increased to {delta_clean} MB. Released: {released_mb} MB")

输出:

$ python test.py
Before Op: 136.0 MB
Increased to 1206.0 MB.
Increased to 1186.0 MB. Released: 20.0 MB

当把 x_ = torch.zeros(1024, 1024) 改为 x_ = torch.zeros(1024, 1024*20) 以后再运行该脚本, 得到:

$ python test.py
Before Op: 137.0 MB
Increased to 1266.0 MB.
Increased to 1186.0 MB. Released: 80.0 MB

如果删除

torch.cuda.empty_cache()

则得到

Before Op: 136.0 MB
Increased to 1206.0 MB.
Increased to 1186.0 MB. Released: 20.0 MB
Before Op: 1322.0 MB
Increased to 20.0 MB.
Increased to 20.0 MB. Released: 0.0 MB

可以看到, 一方面, 显存只有变量被清除且明确调用 torch.cuda.empty_cache() 以后才会被释放; 另一方面, 在脚本结束之前, PyTorch 仍然保留了一个 CUDA context 的显存.

多次训练

如果在循环里多次启动同一个模型的训练, 则情形略有不同. 下面这段脚本初始化了同一个模型两次, 并且分别将其和对应的输入载入到GPU上:

#...
class NaiveN(torch.nn.Module):
    """Minimal two-layer MLP: Linear(nx, nx) -> ReLU -> Linear(nx, 1) -> ReLU.

    Args:
        nx: input feature dimension (also the hidden width).
    """

    def __init__(self, nx) -> None:
        super().__init__()
        self.lin1 = torch.nn.Linear(nx, nx)
        self.lin2 = torch.nn.Linear(nx, 1)
        self.act = torch.nn.ReLU()

    def forward(self, x):
        # Bug fix: the original applied lin2 to the raw input `x`, so the
        # output of lin1 was computed and then discarded; the hidden
        # activation must be fed into the second layer.
        o = self.act(self.lin1(x))
        o = self.act(self.lin2(o))
        return o


# Repeated-training experiment: run two full train-one-step cycles of the
# same model and sample GPU memory after every individual statement.
dev_name = "cuda:0"

device = torch.device(dev_name)
nx = 1024

# Baseline reading before anything touches the GPU.
mem_before = get_gpu_memory_map()
print(f"Before: Mem({dev_name}) {mem_before} MB")

mems = [
    mem_before,
]
cmds_all = []
# Two loops so the second iteration shows how memory behaves when the same
# model is trained again after the first iteration's cleanup.
for k in range(2):
    model = NaiveN(nx)
    loss_func = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    x_ = torch.ones(1024, nx).float()
    y_ = torch.ones(1024, 1).float()

    # Each entry is one statement (or a list of statements run back-to-back)
    # executed via exec() so memory can be measured between steps.
    # NOTE: exec'd `del` only works because this runs at module scope;
    # inside a function, exec() cannot delete local variables.
    cmds = [
        "x = x_.to(device)",
        "y = y_.to(device)",
        "model.to(device)",
        "optimizer.zero_grad()",
        "yp = model(x)",
        "loss = loss_func(yp, y)",
        "loss.backward()",
        "optimizer.step()",
        # Each deletion is paired with empty_cache() so the measurement
        # reflects memory actually returned to the driver, not just freed
        # back into PyTorch's caching allocator.
        ["del loss", "torch.cuda.empty_cache()"],
        ["del yp", "torch.cuda.empty_cache()"],
        ["del y", "torch.cuda.empty_cache()"],
        ["del x", "torch.cuda.empty_cache()"],
        ["del model", "torch.cuda.empty_cache()"],
        ["del optimizer", "torch.cuda.empty_cache()"],
    ]

    for cmd in cmds:
        if type(cmd) is str:
            # Single statement: execute it and keep its text for the log line.
            exec(cmd)
            cmd_s = cmd
        elif type(cmd) is list:
            # Statement group: build the combined label, then run each part.
            cmd_s = '" and "'.join(cmd)
            for c in cmd:
                exec(c)
        # Sample memory after the command(s) and report the delta vs baseline.
        mem_after = get_gpu_memory_map()
        mem_diff = mem_after - mem_before

        print(
            f'Loop{k}, After "{cmd_s}", Mem({dev_name}) increased {mem_diff} MB'
        )

输出:

$ python test.py
Before: Mem(cuda) 137.0 MB
Loop0, After "x = x_.to(device)", Mem(cuda) increased 1206.0 MB
Loop0, After "y = y_.to(device)", Mem(cuda) increased 1208.0 MB
Loop0, After "model.to(device)", Mem(cuda) increased 1208.0 MB
Loop0, After "optimizer.zero_grad()", Mem(cuda) increased 1208.0 MB
Loop0, After "yp = model(x)", Mem(cuda) increased 1218.0 MB
Loop0, After "loss = loss_func(yp, y)", Mem(cuda) increased 1218.0 MB
Loop0, After "loss.backward()", Mem(cuda) increased 1226.0 MB
Loop0, After "optimizer.step()", Mem(cuda) increased 1226.0 MB
Loop0, After "del loss" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop0, After "del yp" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop0, After "del y" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop0, After "del x" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop0, After "del model" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop0, After "del optimizer" and "torch.cuda.empty_cache()", Mem(cuda) increased 1204.0 MB
Loop1, After "x = x_.to(device)", Mem(cuda) increased 1224.0 MB
Loop1, After "y = y_.to(device)", Mem(cuda) increased 1226.0 MB
Loop1, After "model.to(device)", Mem(cuda) increased 1226.0 MB
Loop1, After "optimizer.zero_grad()", Mem(cuda) increased 1226.0 MB
Loop1, After "yp = model(x)", Mem(cuda) increased 1226.0 MB
Loop1, After "loss = loss_func(yp, y)", Mem(cuda) increased 1226.0 MB
Loop1, After "loss.backward()", Mem(cuda) increased 1226.0 MB
Loop1, After "optimizer.step()", Mem(cuda) increased 1226.0 MB
Loop1, After "del loss" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop1, After "del yp" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop1, After "del y" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop1, After "del x" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop1, After "del model" and "torch.cuda.empty_cache()", Mem(cuda) increased 1226.0 MB
Loop1, After "del optimizer" and "torch.cuda.empty_cache()", Mem(cuda) increased 1204.0 MB

可以看到, 在每个训练循环结束以后, 必须要删除所有相关的变量且释放缓存, GPU memory才会真正被清除. 如果漏掉了 del optimizer 则显存不会有任何变化.

一个尚未解决的问题

在某台含有多张GPU的机器上, 采用上面类似的方式序列地训练多个模型时, 我发现

device = torch.device('cuda')
device = torch.device(f'cuda:{some_rank}')

对显存释放的效应是不同的. 当使用前者时, 当前循环结束、开始下一个训练循环以后, 和上一个部分的结果一样, 显存保持不变. 然而, 当指定特定的显卡号 some_rank 以后, 每次开始一个新的训练循环时, 显存都会增长大约一个单次训练的大小. 这样持续几个循环以后, 卡的显存就爆了.

似乎有人碰到了类似的问题: discuss.pytorch.org: CUDA memory not being freed?. 但是现在没有时间去找出原因.