NVIDIA GPU自动调度神经网络
NVIDIA GPU自动调度神经网络
对特定设备和工作负载进行自动调整对于获得最佳性能至关重要。这是有关如何使用自动调度器为NVIDIA GPU调整整个神经网络。
为了自动调整神经网络,将网络划分为小的子图,并对其进行独立调整。每个子图被视为一个搜索任务。任务调度程序可以对时间进行分片,并为这些任务动态分配时间资源。任务调度程序可以预测每个任务对端到端执行时间的影响,确定可以最大程度地减少执行时间的任务的优先级。
对于每个子图,使用compute声明tvm/python/topi获取张量表达式形式的计算DAG。使用自动调度器来构造此DAG的搜索空间,并搜索良好的调度(低级优化)。
与依靠手动模板定义搜索空间的基于模板的autotvm不同,自动调度程序不需要任何调度模板。换句话说,自动调度程序仅在其中使用tvm/python/topi计算声明,而不使用现有的调度模板。
本文无法在Windows或最新版本的macOS上运行。要使其运行,需要将本文的内容包装在一个if __name__ == "__main__":块中。
import numpy as np
import tvm
from tvm import relay, auto_scheduler
import tvm.relay.testing
from tvm.contrib import graph_runtime
定义网络
需要使用中继前端API定义网络。可以加载一些预定义的网络tvm.relay.testing。从MXNet,ONNX,PyTorch和TensorFlow加载模型。
对于卷积神经网络,尽管自动调度程序可以在任何布局下正常工作,但发现使用NHWC布局通常可以实现最佳性能。还使用自动调度程序对NHWC布局实施了更多优化。建议将模型转换为NHWC布局以使用自动调度程序。可以使用ConvertLayout传递在TVM中进行布局转换。
def get_network(name, batch_size, layout="NHWC", dtype="float32"):
"""Get the symbol definition and random weight of a network"""
# auto-scheduler prefers NHWC layout
if layout == "NHWC":
image_shape = (224, 224, 3)
elif layout == "NCHW":
image_shape = (3, 224, 224)
else:
raise ValueError("Invalid layout: " + layout)
input_shape = (batch_size,) + image_shape
output_shape = (batch_size, 1000)
if name.startswith("resnet-"):
n_layer = int(name.split("-")[1])
mod, params = relay.testing.resnet.get_workload(
num_layers=n_layer,
batch_size=batch_size,
layout=layout,
dtype=dtype,
image_shape=image_shape,
)
elif name.startswith("resnet3d-"):
n_layer = int(name.split("-")[1])
mod, params = relay.testing.resnet.get_workload(
num_layers=n_layer,
batch_size=batch_size,
layout=layout,
dtype=dtype,
image_shape=image_shape,
)
elif name == "mobilenet":
mod, params = relay.testing.mobilenet.get_workload(
batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape
)
elif name == "squeezenet_v1.1":
assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout"
mod, params = relay.testing.squeezenet.get_workload(
version="1.1",
batch_size=batch_size,
dtype=dtype,
image_shape=image_shape,
)
elif name == "inception_v3":
input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3)
mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
elif name == "mxnet":
# an example for mxnet model
from mxnet.gluon.model_zoo.vision import get_model
assert layout == "NCHW"
block = get_model("resnet18_v1", pretrained=True)
mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype)
net = mod["main"]
net = relay.Function(
net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs
)
mod = tvm.IRModule.from_expr(net)
return mod, params, input_shape, output_shape
# Define the neural network and compilation target
network = "resnet-18"
batch_size = 1
layout = "NHWC"
target = tvm.target.Target("cuda")
dtype = "float32"
log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name)
提取搜索任务
接下来,从网络中提取搜索任务及其权重。任务的权重是该任务的子图在整个网络中的出现次数。通过使用权重,可以将网络的端到端延迟近似为,其中sum(latency[t] * weight[t])latency[t]是任务的延迟,weight[t]是任务的权重。任务调度程序将仅优化此目标。
# Extract tasks from the network
print("Extract tasks...")
mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype)
tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target)
for idx, task in enumerate(tasks):
print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key))
print(task.compute_dag)
输出:
Extract tasks...
========== Task 0 (workload key: ["d7b65649a4dd54becea0a52aabbc5af5", 1, 1000, 1, 1000]) ==========
placeholder = PLACEHOLDER [1, 1000]
T_softmax_maxelem(i0) max= placeholder[i0, k]
T_softmax_exp(i0, i1) = tir.exp((placeholder[i0, i1] - T_softmax_maxelem[i0]))
T_softmax_expsum(i0) += T_softmax_exp[i0, k]
T_softmax_norm(i0, i1) = (T_softmax_exp[i0, i1]/T_softmax_expsum[i0])
========== Task 1 (workload key: ["9847f8cc0b305137f49f2c5c0c8ab25d", 1, 512, 1000, 512, 1000, 1, 1000]) ==========
placeholder = PLACEHOLDER [1, 512]
placeholder = PLACEHOLDER [1000, 512]
T_dense(i, j) += (placeholder[i, k]*placeholder[j, k])
placeholder = PLACEHOLDER [1000]
T_add(ax0, ax1) = (T_dense[ax0, ax1] + placeholder[ax1])
========== Task 2 (workload key: ["69115f188984ae34ede37c3b8ca40b43", 1, 7, 7, 512, 1, 1, 1, 512]) ==========
placeholder = PLACEHOLDER [1, 7, 7, 512]
tensor(ax0, ax1, ax2, ax3) += placeholder[ax0, ((ax1*7) + rv0), ((ax2*7) + rv1), ax3]
tensor(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3]/(float32((select((bool)1, ((ax1 + 1)*7), (((ax1 + 1)*7) + 1)) - (ax1*7)))*float32((select((bool)1, ((ax2 + 1)*7), (((ax2 + 1)*7) + 1)) - (ax2*7)))))
========== Task 3 (workload key: ["ad6cecbf5d85cb1cda3c2bb7af170211", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 1, 1, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
placeholder = PLACEHOLDER [1, 7, 7, 512]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 512, 512]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*4)*4) + (floordiv(h, 2)*4)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 7, 7, 512]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
placeholder = PLACEHOLDER [1, 1, 1, 512]
T_multiply(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3]*placeholder[ax0, 0, 0, ax3])
placeholder = PLACEHOLDER [1, 1, 1, 512]
T_add(ax0, ax1, ax2, ax3) = (T_multiply[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 4 (workload key: ["3a69f9fbc63760d99e36b4c17b3bfc57", 1, 7, 7, 512, 4, 4, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
placeholder = PLACEHOLDER [1, 7, 7, 512]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 512, 512]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*4)*4) + (floordiv(h, 2)*4)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 1, 1, 512]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 5 (workload key: ["d730bcd28f0920f6b97245e2a11bd8d6", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 7, 7, 512]) ==========
placeholder = PLACEHOLDER [1, 7, 7, 512]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 8)) && (i2 >= 1)) && (i2 < 8)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 16), ((floormod(floordiv(p, 4), 4)*2) + eps), ((floormod(p, 4)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 512, 512]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*4)*4) + (floordiv(h, 2)*4)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 7, 7, 512]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
========== Task 6 (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 14, 14, 256, 3, 3, 256, 512, 1, 1, 1, 512, 1, 7, 7, 512]) ==========
placeholder = PLACEHOLDER [1, 14, 14, 256]
PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
placeholder = PLACEHOLDER [3, 3, 256, 512]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
placeholder = PLACEHOLDER [1, 1, 1, 512]
T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 7 (workload key: ["f3b6c10fcc6ce01ff01add933e4d21e9", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
placeholder = PLACEHOLDER [1, 14, 14, 256]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 256, 256]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*7)*7) + (floordiv(h, 2)*7)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 14, 14, 256]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
placeholder = PLACEHOLDER [1, 1, 1, 256]
T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 8 (workload key: ["b8b52b9be9df6102466a22a014c44c1f", 1, 14, 14, 256, 4, 4, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
placeholder = PLACEHOLDER [1, 14, 14, 256]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 256, 256]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*7)*7) + (floordiv(h, 2)*7)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 1, 1, 256]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 9 (workload key: ["d374e472bd9d8164892b9e28a0a8cb59", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 14, 14, 256]) ==========
placeholder = PLACEHOLDER [1, 14, 14, 256]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 15)) && (i2 >= 1)) && (i2 < 15)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 49), ((floormod(floordiv(p, 7), 7)*2) + eps), ((floormod(p, 7)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 256, 256]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*7)*7) + (floordiv(h, 2)*7)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 14, 14, 256]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
========== Task 10 (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 28, 28, 128, 3, 3, 128, 256, 1, 1, 1, 256, 1, 14, 14, 256]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
placeholder = PLACEHOLDER [3, 3, 128, 256]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
placeholder = PLACEHOLDER [1, 1, 1, 256]
T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 11 (workload key: ["c4500b4e2fd04e695c32d2f31bbdc14a", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 128, 128]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*14)*14) + (floordiv(h, 2)*14)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 28, 28, 128]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
placeholder = PLACEHOLDER [1, 1, 1, 128]
T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 12 (workload key: ["e4cdf917b876dbdd64488c3818d9c141", 1, 28, 28, 128, 4, 4, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 128, 128]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*14)*14) + (floordiv(h, 2)*14)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 1, 1, 128]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 13 (workload key: ["dac19035dd5fe9424ee8617421b9c817", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 29)) && (i2 >= 1)) && (i2 < 29)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*2) + eps), ((floormod(p, 14)*2) + nu), ci]
B(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 4) == 2)), ..(OMITTED).. ormod(i, 4) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [4, 4, 128, 128]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 4) == 3) && (floormod(j, 2) == 1)), 1f, select(((floormod(i, 4) == 3) && (floormod(j, 2) == 0)), ..(OMITTED).. ct(((floormod(i, 4) == 0) && (floormod(j, 2) == 1)), 0f, select(((floormod(i, 4) == 0) && (floormod(j, 2) == 0)), 1f, 0f))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 2), floormod(w, 2), ((((n*14)*14) + (floordiv(h, 2)*14)) + floordiv(w, 2)), co]
placeholder = PLACEHOLDER [1, 28, 28, 128]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
========== Task 14 (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 56, 56, 64, 3, 3, 64, 128, 1, 1, 1, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 56, 56, 64]
PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
placeholder = PLACEHOLDER [3, 3, 64, 128]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
placeholder = PLACEHOLDER [1, 1, 1, 128]
T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 15 (workload key: ["1e3c4211ffd2f2db91078ae4d04b779d", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
placeholder = PLACEHOLDER [1, 56, 56, 64]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]
B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)), ..(OMITTED).. (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [6, 6, 64, 64]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)), ..(OMITTED).. 6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*14)*14) + (floordiv(h, 4)*14)) + floordiv(w, 4)), co]
placeholder = PLACEHOLDER [1, 56, 56, 64]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
placeholder = PLACEHOLDER [1, 1, 1, 64]
T_add(ax0, ax1, ax2, ax3) = (T_add[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 16 (workload key: ["b818b53148cd450f86569dfc3e04cb8a", 1, 56, 56, 64, 6, 6, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
placeholder = PLACEHOLDER [1, 56, 56, 64]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]
B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)), ..(OMITTED).. (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [6, 6, 64, 64]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)), ..(OMITTED).. 6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*14)*14) + (floordiv(h, 4)*14)) + floordiv(w, 4)), co]
placeholder = PLACEHOLDER [1, 1, 1, 64]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 17 (workload key: ["3ea73fb9b0364374730d09e068821f95", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 56, 56, 64]) ==========
placeholder = PLACEHOLDER [1, 56, 56, 64]
data_pad(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 1) && (i1 < 57)) && (i2 >= 1)) && (i2 < 57)), placeholder[i0, (i1 - 1), (i2 - 1), i3], 0f)
input_tile(eps, nu, p, ci) = data_pad[floordiv(p, 196), ((floormod(floordiv(p, 14), 14)*4) + eps), ((floormod(p, 14)*4) + nu), ci]
B(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 6) == 5)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 6) == 4)), ..(OMITTED).. (floormod(j, 6) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 6) == 0)), 1f, 0f))))))))))))))))))))))))))))))))))))
data_pack(eps, nu, p, ci) += ((input_tile[r_a, r_b, p, ci]*B[r_a, eps])*B[r_b, nu])
placeholder = PLACEHOLDER [6, 6, 64, 64]
bgemm(eps, nu, p, co) += (data_pack[eps, nu, p, ci]*placeholder[eps, nu, co, ci])
A(i, j) = select(((floormod(i, 6) == 5) && (floormod(j, 4) == 3)), 1f, select(((floormod(i, 6) == 5) && (floormod(j, 4) == 2)), ..(OMITTED).. 6) == 0) && (floormod(j, 4) == 1)), 0f, select(((floormod(i, 6) == 0) && (floormod(j, 4) == 0)), 1f, 0f))))))))))))))))))))))))
inverse(vh, vw, p, co) += ((bgemm[r_a, r_b, p, co]*A[r_a, vh])*A[r_b, vw])
conv2d_winograd(n, h, w, co) = inverse[floormod(h, 4), floormod(w, 4), ((((n*14)*14) + (floordiv(h, 4)*14)) + floordiv(w, 4)), co]
placeholder = PLACEHOLDER [1, 56, 56, 64]
T_add(ax0, ax1, ax2, ax3) = (conv2d_winograd[ax0, ax1, ax2, ax3] + placeholder[ax0, ax1, ax2, ax3])
========== Task 18 (workload key: ["a5612fdeb9db4d579a75ec225ea4c06a", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]) ==========
placeholder = PLACEHOLDER [1, 112, 112, 64]
pad_temp(ax0, ax1, ax2, ax3) = tir.if_then_else(((((ax1 >= 1) && (ax1 < 113)) && (ax2 >= 1)) && (ax2 < 113)), placeholder[ax0, (ax1 - 1), (ax2 - 1), ax3], -3.40282e+38f)
tensor(ax0, ax1, ax2, ax3) max= pad_temp[ax0, ((ax1*2) + dh), ((ax2*2) + dw), ax3]
placeholder = PLACEHOLDER [1, 1, 1, 64]
T_add(ax0, ax1, ax2, ax3) = (tensor[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 19 (workload key: ["12b88bedece6984af589a28b43e0f3c4", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]) ==========
placeholder = PLACEHOLDER [1, 224, 224, 3]
PaddedInput(i0, i1, i2, i3) = tir.if_then_else(((((i1 >= 3) && (i1 < 227)) && (i2 >= 3)) && (i2 < 227)), placeholder[i0, (i1 - 3), (i2 - 3), i3], 0f)
placeholder = PLACEHOLDER [7, 7, 3, 64]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
placeholder = PLACEHOLDER [1, 1, 1, 64]
T_add(ax0, ax1, ax2, ax3) = (Conv2dOutput[ax0, ax1, ax2, ax3] + placeholder[ax0, 0, 0, ax3])
T_relu(ax0, ax1, ax2, ax3) = max(T_add[ax0, ax1, ax2, ax3], 0f)
========== Task 20 (workload key: ["7006235cfc29b73be524cf390ed5a977", 1, 56, 56, 64, 1, 1, 64, 64, 1, 56, 56, 64]) ==========
placeholder = PLACEHOLDER [1, 56, 56, 64]
PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
placeholder = PLACEHOLDER [1, 1, 64, 64]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, (yy + ry), (xx + rx), rc]*placeholder[ry, rx, rc, ff])
========== Task 21 (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 56, 56, 64, 1, 1, 64, 128, 1, 28, 28, 128]) ==========
placeholder = PLACEHOLDER [1, 56, 56, 64]
PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
placeholder = PLACEHOLDER [1, 1, 64, 128]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
========== Task 22 (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 28, 28, 128, 1, 1, 128, 256, 1, 14, 14, 256]) ==========
placeholder = PLACEHOLDER [1, 28, 28, 128]
PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
placeholder = PLACEHOLDER [1, 1, 128, 256]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
========== Task 23 (workload key: ["f4380bb1dc62422a69ad4a1a9771f927", 1, 14, 14, 256, 1, 1, 256, 512, 1, 7, 7, 512]) ==========
placeholder = PLACEHOLDER [1, 14, 14, 256]
PaddedInput(i0, i1, i2, i3) = placeholder[i0, i1, i2, i3]
placeholder = PLACEHOLDER [1, 1, 256, 512]
Conv2dOutput(nn, yy, xx, ff) += (PaddedInput[nn, ((yy*2) + ry), ((xx*2) + rx), rc]*placeholder[ry, rx, rc, ff])
开始调整
设置一些选项来优化和启动搜索任务
- measure_ctx启动不同的测量过程以提供隔离。保护主进程免受测量期间GPU崩溃的影响,避免其它运行时冲突。
- min_repeat_ms定义每次测量中一次“重复”的最小持续时间。这样可以预热GPU,对于获得准确的测量结果是必不可少的。通常,建议值> = 300毫秒。
- num_measure_trials是在调整期间可以使用的测量试验的次数。可以将其设置为较小的数字(例如200),进行快速演示。实际上,建议将其设置为900 * len(tasks),使搜索收敛。例如,resnet-18中有24个任务,将其设置为20000。根据时间预算调整此参数。
- 将测量记录转储到日志文件RecordToFile中,这些测量记录可用于最好地查询历史记录,恢复搜索,进行更多分析。
- 有关更多参数auto_scheduler.TuningOptions, 请参见auto_scheduler.LocalRPCMeasureContext。
def run_tuning():
print("Begin tuning...")
measure_ctx = auto_scheduler.LocalRPCMeasureContext(repeat=1, min_repeat_ms=300, timeout=10)
tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
tune_option = auto_scheduler.TuningOptions(
num_measure_trials=200, # change this to 20000 to achieve the best performance
runner=measure_ctx.runner,
measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
)
tuner.tune(tune_option)
# We do not run the tuning in our webpage server since it takes too long.
# Uncomment the following line to run it by yourself.
# run_tuning()
笔记
调整期间说明打印的信息
在调整期间,控制台上会打印很多信息。用于调试目的。最重要的信息是任务调度程序的输出。下表是示例输出。
----------------------------------------------------------------------
------------------------------ [ Task Scheduler ]
----------------------------------------------------------------------
| ID | Latency (ms) | Speed (GFLOPS) | Trials |
-------------------------------------------------
| 0 | 0.005 | 0.88 | 64 |
| 1 | 0.010 | 99.10 | 64 |
| 2 | 0.006 | 0.00 | 64 |
| 3 | 0.145 | 979.78 | 384 |
| 4 | 0.130 | 1097.02 | 384 |
| 5 | 0.143 | 992.69 | 384 |
| 6 | 0.076 | 1526.86 | 192 |
| 7 | 0.115 | 999.44 | 320 |
| 8 | 0.079 | 1449.39 | 320 |
| 9 | 0.122 | 938.73 | 384 |
| 10 | 0.063 | 1832.98 | 192 |
| 11 | 0.072 | 1763.62 | 256 |
| 12 | 0.062 | 2036.40 | 192 |
| 13 | 0.068 | 1874.44 | 192 |
| 14 | 0.049 | 2346.50 | 128 |
| 15 | 0.076 | 1694.31 | 256 |
| 16 | 0.067 | 1933.30 | 448 |
| 17 | 0.076 | 1680.90 | 256 |
| 18 | 0.022 | 98.43 | 64 |
| 19 | 0.076 | 3112.55 | 192 |
| 20 | 0.013 | 2026.44 | 64 |
| 21 | 0.011 | 1136.69 | 64 |
| 22 | 0.013 | 992.47 | 64 |
| 23 | 0.020 | 627.56 | 64 |
-------------------------------------------------
Estimated total latency: 1.587 ms Trials: 4992 Used time : 13296 s Next ID: 3
下表列出了所有任务的延迟和(估计)速度。列出了所有任务的测量试验分配。最后一行显示这些任务的总加权延迟,可以粗略估计网络的端到端执行时间。最后一行还显示测量试验的总数,自动调整所花费的总时间,要调整的下一个任务的ID。
自动调度程序将尝试某些无效的调度,出现一些“ dmlc :: Error”和CUDA错误。继续进行调整,放心地忽略,这些错误与主要过程是隔离的。
笔记
提前终止调整
可以通过强制终止此过程来提前终止调整。在日志文件中为每个任务至少获得一个有效的调度,能够进行编译(下面的部分)。
编译和评估
自动调整后,可以使用发现的最佳调度表来编译网络。在自动调整期间,所有测量记录都将转储到日志文件中,读取日志文件并加载最佳调度。
# Compile with the history best
print("Compile...")
with auto_scheduler.ApplyHistoryBest(log_file):
with tvm.transform.PassContext(opt_level=3, config={"relay.backend.use_auto_scheduler": True}):
lib = relay.build(mod, target=target, params=params)
# Create graph runtime
ctx = tvm.context(str(target), 0)
module = graph_runtime.GraphModule(lib["default"](ctx))
data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype))
module.set_input("data", data_tvm)
# Evaluate
print("Evaluate inference time cost...")
ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500)
prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond
print("Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)))
输出:
Compile...
Evaluate inference time cost...
Mean inference time (std dev): 3.22 ms (0.02 ms)
其它技巧
- 调整过程中,自动调度器需要编译许多程序并从中提取功能。该部分占用大量CPU,建议使用具有多个内核的高性能CPU,加快搜索速度。
- 提取大型日志文件,仅保存最有用的记录。python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json
- 从上一个日志文件继续搜索。load_log_file在function中创建任务调度程序时,只需添加一个新参数run_tuning。tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
- 如果有多个目标GPU,全部用于测量,并行化测量。了解如何使用RPC跟踪器和RPC服务器。要在自动调度使用RPC跟踪,用auto_scheduler.RPCRunner,更换转轮TuningOptions 。
NVIDIA GPU自动调度神经网络的更多相关文章
- GPU自动调度卷积层
GPU自动调度卷积层 本文对GPU使用自动调度程序. 与依靠手动模板定义搜索空间的基于模板的autotvm不同,自动调度程序不需要任何模板.用户只需要编写计算声明,无需任何调度命令或模板.自动调度程序 ...
- ARM CPU自动调度神经网络
ARM CPU自动调度神经网络 对特定设备和工作负载进行自动调度,对于获得最佳性能至关重要.通过RPC使用自动调度器为ARM CPU调度整个神经网络. 为了自动调度神经网络,将网络划分为小的子图,进行 ...
- 为x86 CPU自动调度神经网络
为x86 CPU自动调度神经网络 对特定设备和工作负载进行自动调试对于获得最佳性能至关重要.这是有关如何使用自动调度器为x86 CPU调试整个神经网络的文档. 为了自动调试神经网络,将网络划分为小的子 ...
- NVIDIA GPU的神经网络自动调度
NVIDIA GPU的神经网络自动调度 针对特定设备和工作负载的自动调整对于获得最佳性能至关重要.这是一个关于如何使用自动调度器为NVIDIA GPU调整整个神经网络的资料. 为了自动调整一个神经网络 ...
- NVIDIA GPU卷积网络的自动调谐
NVIDIA GPU卷积网络的自动调谐 针对特定设备和工作负载的自动调整对于获得最佳性能至关重要.这是关于如何为NVIDIA GPU调整整个卷积网络. NVIDIA GPU在TVM中的操作实现是以模板 ...
- 自动调度GPU的卷积层
自动调度GPU的卷积层 这是有关如何对GPU使用自动调度程序的文档. 与依靠手动模板定义搜索空间的基于模板的autotvm不同,自动调度程序不需要任何模板.用户只需要编写计算声明,而无需任何调度命令或 ...
- TVM自动调度器
TVM自动调度器 随着模型大小,算子多样性和硬件异构性的不断增长,优化深度神经网络的执行速度非常困难.从计算的角度来看,深度神经网络只是张量计算的一层又一层.这些张量计算(例如matmul和conv2 ...
- 助力深度学习!阿里开源可插拔 GPU 共享调度工具
根据 Gartner 对全球 CIO 的调查结果显示,人工智能将成为 2019 年组织革命的颠覆性力量.对于人工智能来说,算力即正义,成本即能力,利用 Docker 和 Kubernetes 代表云原 ...
- NVIDIA GPU Pascal架构简述
NVIDIA GPU Pascal架构简述 本文摘抄自英伟达Pascal架构官方白皮书:https://www.nvidia.com/en-us/data-center/resources/pasca ...
随机推荐
- 从UWP项目移植到WinUI桌面版你需要做哪些事情
就像文章标题说的我是打算写一篇从UWP移植到WinUI的帖子,本来打算是想写一篇WinUI的学习帖子,可是觉得市面上UWP的教程WPF的教程都是很多了,所以干脆就直接硬怼项目吧,先声明我不是来挖UWP ...
- 什么是响应式web设计
什么是响应式web设计 现在开发一个产品,基本上都会需要兼顾 PC端和 移动端. 一般有两种思路: 1.为每个终端做一个特定的版本,并给2级域名,根据终端环境调用不同的版本代码. 2.一个网站能够兼容 ...
- 【SpringBoot】SpringBoot2.x整合定时任务和异步任务处理
SpringBoot2.x整合定时任务和异步任务处理 一.项目环境 springboot2.x本身已经集成了定时任务模块和异步任务,可以直接使用 二.springboot常用定时任务配置 1.在启动类 ...
- hdu 1298 字典树 + DFS (模拟T9文本输入)
题意: 给你一些按键顺序,让你输出每一步中概率最大的那个单词,这里的概率计算方 法好好看看别弄错了,一开始就是因为弄错了,各种wa,比如 abc 1 ,ab 1,那么 ab 的概率就是2 ...
- UVA11137(立方数之和)
题意: 给你一个n(<=10000),问他如果由立方数之和组成,那么有多少种方法? 思路: 一个地推公式,d[i][j] 表示用不大于i的数字去组合j这个数字有多少种方 ...
- Windows系统应急响应
Windows 系统的应急事件,按照处理的方式,可分为下面几种类别: 病毒.木马.蠕虫事件 Web 服务器入侵事件 或 安装的第三方服务入侵事件. 系统入侵事件,如利用 Windows 的漏洞攻击入侵 ...
- 什么?这么精髓的View的Measure流程源码全解析,你确定不看看?
前言 Android开发中我们平时接触最多的是各种View, View是一个比较大的体系,包含了绘制流程.事件分发.各种动画.自定义View 等等.前几天我写了一篇事件分发源码解析的文章, 今天我们来 ...
- 【python】Leetcode每日一题-森林中的兔子
[python]Leetcode每日一题-森林中的兔子 [题目描述] 森林中,每个兔子都有颜色.其中一些兔子(可能是全部)告诉你还有多少其他的兔子和自己有相同的颜色.我们将这些回答放在 answers ...
- layui的loading加载中
var load = layer.load(1, { content: '数据加载中', shade: [0.4, '#393D49'], // time: 10 * 1000, success: f ...
- Outlook关闭时最小化
一:背景环境: 当使用Outlook的时候,不小心点关闭,会不能及时发现接收的新邮件. 二:解决方法: 利用KeepOutlookRunning.dll插件,可以实现,点击关闭时,outlook没有实 ...