
 # Two-layer fully connected ReLU network trained with plain NumPy (manual forward and backward pass).
import numpy as np

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = np.random.randn(N, D_in)  # (64, 1000)
y = np.random.randn(N, D_out)  # (64, 10)

# Randomly initialised weights.
w1 = np.random.randn(D_in, H)  # (1000, 100)
w2 = np.random.randn(H, D_out)  # (100, 10)

learning_rate = 1e-6
for t in range(2):
    # Forward pass: compute predicted y.
    h = x.dot(w1)  # (64, 100)
    h_relu = np.maximum(h, 0)  # (64, 100); element-wise max with 0 implements ReLU
    y_pred = h_relu.dot(w2)  # (64, 10)

    # Sum of squared errors; sum() adds up every element of the array.
    loss = np.square(y_pred - y).sum()

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = 2.0 * (y_pred - y)  # d(loss)/d(y_pred)
    grad_w2 = h_relu.T.dot(grad_y_pred)
    grad_h_relu = grad_y_pred.dot(w2.T)
    grad_h = grad_h_relu.copy()  # (64, 100)
    # ReLU passes no gradient where its input was negative, so zero those positions.
    grad_h[h < 0] = 0
    grad_w1 = x.T.dot(grad_h)  # .T is the transpose; result is (1000, 100)

    # Update weights using gradient descent.
    w1 -= learning_rate * grad_w1  # (1000, 100)
    w2 -= learning_rate * grad_w2


 # Two-layer fully connected ReLU network using PyTorch tensors with a hand-written backward pass.
import torch

dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor  # alternative tensor type for running on a GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data.
x = torch.randn(N, D_in).type(dtype)
y = torch.randn(N, D_out).type(dtype)

# Randomly initialize weights.
w1 = torch.randn(D_in, H).type(dtype)
w2 = torch.randn(H, D_out).type(dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y.
    h = x.mm(w1)  # matrix product; the counterpart of NumPy's dot used for 2-D arrays
    h_relu = h.clamp(min=0)  # clamping below at 0 implements ReLU
    y_pred = h_relu.mm(w2)

    # Sum of squared errors over every element.
    loss = (y_pred - y).pow(2).sum()

    # Backprop to compute gradients of w1 and w2 with respect to loss.
    grad_y_pred = 2.0 * (y_pred - y)  # d(loss)/d(y_pred)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    # ReLU passes no gradient where its input was negative, so zero those positions.
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2


 # Use PyTorch Variables and autograd to implement the two-layer network;
# the backward pass through the network no longer has to be written by hand.
import torch
from torch.autograd import Variable

dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# requires_grad=False: no gradients are computed with respect to the
# input and target data during the backward pass.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# requires_grad=True: gradients with respect to the weights are computed
# during the backward pass and stored in w1.grad / w2.grad.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(2):
    # Forward pass: intermediate values need not be kept because the
    # backward pass is not implemented by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum of squared errors over every element.
    loss = (y_pred - y).pow(2).sum()

    # Read the loss as a Python number with .item(); indexing the summed
    # loss with [0] fails on PyTorch >= 0.4, where it is zero-dimensional.
    print(loss.item())

    # Autograd computes the gradient of loss with respect to every
    # Variable created with requires_grad=True.
    loss.backward()

    # Update weights using gradient descent; operating on .data keeps the
    # update itself out of the autograd graph.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # backward() accumulates into .grad, so BOTH gradients must be reset
    # before the next iteration.
    w1.grad.data.zero_()
    w2.grad.data.zero_()


 # Two-layer network whose ReLU is a custom autograd Function with its own forward and backward.
import torch
from torch.autograd import Variable


class MyReLU(torch.autograd.Function):
    """ReLU non-linearity implemented as a custom autograd Function.

    Both methods are static and the function is invoked through
    ``MyReLU.apply``, the form of the custom Function API that current
    PyTorch accepts.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` clamped below at zero.

        The input is saved on ``ctx`` because ``backward`` needs it to know
        where the gradient must be blocked.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the input of ``forward``.

        ``grad_output`` is passed through unchanged except at positions
        where the saved input was negative, which are set to zero.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()  # clone so grad_output itself is not modified
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.FloatTensor

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Input and target data: no gradients required.
x = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)

# Weights: gradients are computed during the backward pass.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

learning_rate = 1e-6
for t in range(2):
    # Custom Functions are called through their .apply entry point.
    relu = MyReLU.apply

    # Forward pass.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Sum of squared errors over every element.
    loss = (y_pred - y).pow(2).sum()

    # Backward pass; MyReLU.backward is used for the ReLU step.
    loss.backward()

    # Update weights using gradient descent.
    w1.data -= learning_rate * w1.grad.data
    w2.data -= learning_rate * w2.grad.data

    # backward() accumulates into .grad, so BOTH gradients must be reset
    # before the next iteration.
    w1.grad.data.zero_()
    w2.grad.data.zero_()


