[Deep Learning Basics] Coding a Neural Network (with TensorFlow and PyTorch)


Source

 

https://fullstackdeeplearning.com/spring2021/notebook-1/

 


Notes from the Full Stack Deep Learning course.

 

Deep Learning Fundamentals

Environment

!python --version
>>> Python 3.8.3

!pip list | findstr "tensorflow"
>>> tensorflow                         2.3.1
    tensorflow-estimator               2.3.0
    
!pip list | findstr "torch"
>>> torch                              1.8.1

Basic numerical computing

import numpy as np 

# Initialize a numpy ndarray with 3 rows, 2 columns
X = np.zeros((3,2))
X
>>> array([[0., 0.],
           [0., 0.],
           [0., 0.]])
          
# By default, ndarrays are float64
X.shape, X.dtype
>>> ((3, 2), dtype('float64'))

# We can set values of a whole row 
X[0,:] = 1
X
>>> array([[1., 1.],
           [0., 0.],
           [0., 0.]])
           
# We can set values of a whole column
X[:,0] = 2
X
>>> array([[2., 1.],
           [2., 0.],
           [2., 0.]])
         
X = np.array([
    [1,2],
    [3,4],
    [5,6]
])
X
>>> array([[1, 2],
           [3, 4],
           [5, 6]])
           
x = np.array([10,20])
print(x)
print(X.shape, x.shape)

# We can add ndarrays of different dimensions (NumPy broadcasting)
X + x
>>> [10 20]
    (3, 2) (2,)
    array([[11, 22],
           [13, 24],
           [15, 26]])
           
X.shape, x.shape
>>> ((3, 2), (2,))
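
The addition above relies on NumPy broadcasting: the (2,)-shaped x is virtually repeated along the first axis to match X's (3, 2) shape. A small illustration (assumed example, not from the original notebook):

# X + x is equivalent to adding an explicitly broadcast copy of x
np.broadcast_to(x, X.shape)
>>> array([[10, 20],
           [10, 20],
           [10, 20]])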

# Element-wise multiplication
X * x
>>> array([[ 10,  40],
           [ 30,  80],
           [ 50, 120]])	
           
# Matrix multiplication
# http://matrixmultiplication.xyz/

x = np.array([[10,20],]).T
result = X @ x # alternatively, np.dot(X,x)
result
>>> array([[ 50],
           [110],
           [170]])
           

Indexing

X = np.random.rand(3,2)
X
>>> array([[0.52409791, 0.48565582],
           [0.07866932, 0.64140832],
           [0.64864191, 0.2856841 ]])
           
X > 0.5
>>> array([[ True, False],
           [False,  True],
           [ True, False]])
           
X[X > 0.5] = 1 
X
>>> array([[1.        , 0.48565582],
           [0.07866932, 1.        ],
           [1.        , 0.2856841 ]])

Basic plotting

import matplotlib.pyplot as plt 
plt.style.use('dark_background')
plt.set_cmap('gray')
X = np.random.rand(100,100)
plt.matshow(X)
plt.colorbar()

x = np.linspace(0,100)
y = x * 5 + 10
# y = x * w + b 

plt.plot(x, y , 'x-')

Basic regression with a linear model

# x is 1-dimensional

n = 50
d = 1
x = np.random.uniform(-1,1, (n,d))

# y = 5x +10
weights_true = np.array([[5],])
bias_true = np.array([10])

y_true = x @ weights_true + bias_true 
print(f'x: {x.shape}, weights: {weights_true.shape}, bias: {bias_true.shape}, y: {y_true.shape}')

plt.plot(x, y_true, marker = 'x', label = 'underlying function')
plt.legend()

>>> x: (50, 1), weights: (1, 1), bias: (1,), y: (50, 1)

Basic prediction function: Linear

  • Why the weights are divided by sqrt(n/2), i.e. multiplied by sqrt(2/n) with n = input_dim: He normal initialization (a short sketch follows this list)
  • (cf) A variant of Xavier initialization. With the ReLU activation it avoids the collapsing problem where most of the output distribution ends up at 0.
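
Below is a minimal sketch (not from the original notebook) of why the sqrt(2 / input_dim) factor matters: with ReLU and a large input_dim, it keeps the scale of the layer outputs comparable to the scale of the inputs, so stacked layers don't shrink activations toward 0. The names x_demo, w_plain, w_he are illustrative only.

import numpy as np

rng = np.random.default_rng(0)
input_dim, n = 512, 1000
x_demo = rng.standard_normal((n, input_dim))          # inputs with RMS ~ 1

w_plain = rng.standard_normal((input_dim, input_dim))                          # no scaling
w_he = rng.standard_normal((input_dim, input_dim)) * np.sqrt(2. / input_dim)   # He normal

def relu(a):
    return np.clip(a, 0, None)

def rms(a):
    return np.sqrt((a ** 2).mean())

print('no scaling, ReLU output RMS:', rms(relu(x_demo @ w_plain)))  # ~ sqrt(input_dim / 2) = 16
print('He scaling, ReLU output RMS:', rms(relu(x_demo @ w_he)))     # ~ 1, same scale as the inputs
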
# Let's initialize our predictions 

class Linear:
    def __init__(self, input_dim, num_hidden = 1):
        # The initialization is important to properly deal with different
        # input sizes (otherwise gradients quickly go to 0)
        self.weights = np.random.randn(input_dim, num_hidden) * np.sqrt(2. / input_dim)
        self.bias = np.zeros(num_hidden)
        
    def __call__(self, x):
        return x @ self.weights + self.bias 
    
linear = Linear(d)
y_pred = linear(x)
plt.plot(x, y_true, marker = 'x', label = 'underlying function')
plt.scatter(x, y_pred, color = 'r', marker = '.', label = 'our function')
plt.legend()

Basic loss function: MSE

# How wrong are these initial predictions, exactly?
# It's up to us, and our definition is called the loss function.
# Let's use Mean Squared Error (MSE) as our loss function.

class MSE:
    def __call__(self, y_pred, y_true):
        self.y_pred = y_pred
        self.y_true = y_true
        return ((y_true - y_pred) ** 2).mean()
    
loss = MSE()
print(f'Our initial loss is {loss(y_pred, y_true)}')
>>> Our initial loss is 100.69440815608257
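
As a quick sanity check (assumed addition), a constant-zero prediction should give an MSE equal to the mean of the squared targets:

print(loss(np.zeros_like(y_true), y_true), (y_true ** 2).mean())  # the two numbers are identical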

Add back propagation

# Let's use gradient descent to learn the weights and bias that minimize the loss function.
# For this, we need the gradient of the loss function and the gradients of the linear function

class MSE:
    def __call__(self, y_pred, y_true):
        self.y_pred = y_pred 
        self.y_true = y_true
        return ((y_pred - y_true) ** 2).mean()
    
    def backward(self):
        n = self.y_true.shape[0]
        self.gradient = 2. * (self.y_pred - self.y_true) / n 
        # print('MSE backward', self.y_pred.shape, self.y_true.shape, self.gradient.shape)
        return self.gradient
    
class Linear:
    def __init__(self, input_dim: int, num_hidden: int = 1):
        self.weights = np.random.randn(input_dim , num_hidden) * np.sqrt(2. / input_dim)
        self.bias = np.zeros(num_hidden)
        
    def __call__(self, x):
        self.x = x 
        output = x @ self.weights + self.bias 
        return output 
    
    def backward(self, gradient):
        self.weights_gradient = self.x.T @ gradient 
        self.bias_gradient = gradient.sum(axis = 0)
        self.x_gradient = gradient @ self.weights.T
        return self.x_gradient 
    
    def update(self, lr):
        self.weights = self.weights - lr * self.weights_gradient 
        self.bias = self.bias - lr * self.bias_gradient 
# Take one step forward and one step backward to make sure nothing breaks, and that the loss decreases 
loss = MSE()
linear = Linear(d)
y_pred = linear(x)
print(loss(y_pred, y_true))
loss_gradient = loss.backward()
linear.backward(loss_gradient)
linear.update(lr = 0.1)
y_pred = linear(x)
print(loss(y_pred, y_true))
>>> 99.25684790237388
    65.6829600039985
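
Before training, it is also worth verifying the hand-derived gradients numerically. A minimal gradient check (assumed addition, not part of the original notebook; mse_check, linear_check, eps are illustrative names):

eps = 1e-6
mse_check = MSE()
linear_check = Linear(d)

# analytic gradient of the loss w.r.t. the single weight entry
mse_check(linear_check(x), y_true)
linear_check.backward(mse_check.backward())
analytic = linear_check.weights_gradient[0, 0]

# central finite-difference estimate for the same entry
linear_check.weights[0, 0] += eps
loss_plus = mse_check(linear_check(x), y_true)
linear_check.weights[0, 0] -= 2 * eps
loss_minus = mse_check(linear_check(x), y_true)
numeric = (loss_plus - loss_minus) / (2 * eps)

print(f'analytic: {analytic:.6f}  numeric: {numeric:.6f}')  # should agree to several decimals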

Train using gradient descent!

plt.plot(x, y_true, marker = 'x', label = 'underlying function')

loss = MSE()
linear = Linear(d)

num_epochs = 40
lr = 0.1

for epoch in range(num_epochs):
    y_pred = linear(x)
    loss_value = loss(y_pred, y_true)
    
    if epoch % 5 == 0:
        print(f'Epoch {epoch}, loss {loss_value}')
        plt.plot(x, y_pred.squeeze(), label = f'Epoch {epoch}')
        
    gradient_from_loss = loss.backward()
    linear.backward(gradient_from_loss)
    linear.update(lr)
    
plt.legend(bbox_to_anchor = (1.04, 1), loc = 'upper left')
>>> Epoch 0, loss 101.35137828206379
    Epoch 5, loss 16.80663538956683
    Epoch 10, loss 4.904307761295339
    Epoch 15, loss 2.1082256012327534
    Epoch 20, loss 1.029317768270935
    Epoch 25, loss 0.5177326671189527
    Epoch 30, loss 0.26206113627788236
    Epoch 35, loss 0.13282150949031205
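
As a quick check (not in the original notebook), the learned parameters should be close to the underlying w = 5, b = 10:

print('learned weights:', linear.weights.ravel())  # roughly [5.]
print('learned bias:', linear.bias)                # roughly [10.]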

2-dimensional inputs work, too

# What about 2-dimensional x?
n = 100
d = 2
x = np.random.uniform(-1, 1, (n,d))

# y = w * x + b 
# y = w_0 * x_0 + w_1 * x_1 + b 
# y = w @ x + b 

weights_true = np.array([[2,-1],]).T
bias_true = np.array([0.5])
print(x.shape, weights_true.shape, bias_true.shape)

y_true = x @ weights_true + bias_true
print(f'x: {x.shape}, weights: {weights_true.shape}, bias: {bias_true.shape}, y: {y_true.shape}')

def plot_3d(x, y, y_pred = None):
    import matplotlib.pyplot as plt 
    from mpl_toolkits.mplot3d import Axes3D
    fig = plt.figure()
    ax = fig.add_subplot(111, projection = '3d')
    ax.scatter(x[:,0], x[:, 1], y, label = 'underlying function')
    if y_pred is not None:
        ax.scatter(x[:, 0], x[:, 1], y_pred, label = 'our function')
    plt.legend()
    
plot_3d(x, y_true)
>>> (100, 2) (2, 1) (1,)
    x: (100, 2), weights: (2, 1), bias: (1,), y: (100, 1)

loss = MSE()
linear = Linear(2)
y_pred = linear(x)
print(loss(y_pred, y_true))
fig = plot_3d(x, y_true, y_pred)
>>> 0.5366050825731036

from typing import Callable

def fit(x: np.ndarray, y:np.ndarray, model: Callable, loss: Callable, lr: float, num_epochs : int):
    for epoch in range(num_epochs):
        y_pred = model(x)
        loss_value = loss(y_pred, y)
        print(f'Epoch {epoch}, loss {loss_value}')
        gradient_from_loss = loss.backward()
        model.backward(gradient_from_loss)
        model.update(lr)
        
fit(x, y_true, model = linear , loss = loss , lr = 0.1, num_epochs = 20)
plot_3d(x, y_true, linear(x))

>>> Epoch 0, loss 0.5366050825731036
    Epoch 1, loss 0.41636693987674256
    Epoch 2, loss 0.3304721176663387
    Epoch 3, loss 0.267652161172343
    Epoch 4, loss 0.22056161229265836
    Epoch 5, loss 0.18438186670221607
    Epoch 6, loss 0.1559257576601149
    Epoch 7, loss 0.1330628017811073
    Epoch 8, loss 0.11434985358635925
    Epoch 9, loss 0.0987933920128012
    Epoch 10, loss 0.08569620588822582
    Epoch 11, loss 0.07455823354199605
    Epoch 12, loss 0.06501218490341787
    Epoch 13, loss 0.0567815343631521
    Epoch 14, loss 0.049652928247107396
    Epoch 15, loss 0.04345790356532962
    Epoch 16, loss 0.038060641701995614
    Epoch 17, loss 0.033349651121144255
    Epoch 18, loss 0.029232023257435083
    Epoch 19, loss 0.02562938675711976

Basic regression with a Multi-layer Perceptron

So, we now have a way to automatically fit a linear function to N-dimensional data.

How can this be made to work for non-linear data?

 

# Make non-linear data

n = 200
d = 2 
x = np.random.uniform(-1,1, (n,d))

weights_true = np.array([[5, 1],]).T 
bias_true = np.array([1])

y_true = (x ** 2) @ weights_true + x @ weights_true + bias_true 
print(f'x: {x.shape}, weights: {weights_true.shape}, bias: {bias_true.shape}, y: {y_true.shape}')

plot_3d(x,y_true)
>>> x: (200, 2), weights: (2, 1), bias: (1,), y: (200, 1)

# We can train just fine, but the final loss will remain high, as our linear function is incapable
# of representing the data.

loss = MSE()
linear = Linear(d)
fit(x, y_true, model = linear , loss = loss, lr = 0.1, num_epochs = 40)
plot_3d(x, y_true, linear(x))
>>> Epoch 0, loss 19.328538716017466
    Epoch 1, loss 15.216957555203658
    Epoch 2, loss 12.342777101101431
    Epoch 3, loss 10.289255618909142
    Epoch 4, loss 8.786259983414807
    Epoch 5, loss 7.657895738011214
    Epoch 6, loss 6.788941240314208
    Epoch 7, loss 6.103322253955871
    Epoch 8, loss 5.550297600252665
    Epoch 9, loss 5.095584464600836
    Epoch 10, loss 4.7156491200374715
    Epoch 11, loss 4.394027075225822
    Epoch 12, loss 4.118945184589631
    Epoch 13, loss 3.881779767042434
    Epoch 14, loss 3.6760521897826504
    Epoch 15, loss 3.496770555800361
    Epoch 16, loss 3.3399947656774227
    Epoch 17, loss 3.202546179981643
    Epoch 18, loss 3.081811267814911
    Epoch 19, loss 2.9756066733002586
    Epoch 20, loss 2.8820847027395065
    Epoch 21, loss 2.7996656593417084
    Epoch 22, loss 2.7269882203588236
    Epoch 23, loss 2.6628721173934617
    Epoch 24, loss 2.6062893555123448
    Epoch 25, loss 2.556341481832526
    Epoch 26, loss 2.5122412400199456
    Epoch 27, loss 2.4732974841880715
    Epoch 28, loss 2.4389025768993617
    Epoch 29, loss 2.4085217273341977
    Epoch 30, loss 2.3816838795419124
    Epoch 31, loss 2.3579738642244257 
    Epoch 32, loss 2.337025598218765
    Epoch 33, loss 2.3185161650128356
    Epoch 34, loss 2.3021606445304683
    Epoch 35, loss 2.2877075857756615
    Epoch 36, loss 2.274935034803059
    Epoch 37, loss 2.263647044892634
    Epoch 38, loss 2.253670607077198
    Epoch 39, loss 2.2448529481848727

Add non-linearity: ReLU

np.clip(array, min, max)

: a function that, for each element of array, replaces values smaller than min with min and values larger than max with max (a tiny demo follows below).
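
A tiny demo (assumed example) of np.clip with max = None, which is exactly how the ReLU below is implemented:

np.clip(np.array([-2., -0.5, 0., 0.5, 2.]), 0, None)
>>> array([0. , 0. , 0. , 0.5, 2. ])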

# In order to learn non-linear functions, we need non-linearities in our model.

class Relu:
    def __call__(self, input_):
        self.input_ = input_
        self.output = np.clip(self.input_, 0, None) 
        return self.output
    
    def backward(self, output_gradient):
        # import pdb; pdb.set_trace() # By the way, this is how you can debug 
        self.input_gradient = (self.input_ > 0) * output_gradient 
        return self.input_gradient 
    
relu = Relu()
input_ = np.expand_dims(np.array([1, 0.5, 0, -0.5, -1]), -1)
print(relu(input_))
print(relu.backward(input_))
>>> [[1. ]
     [0.5]
     [0. ]
     [0. ]
     [0. ]]
    [[ 1. ]
     [ 0.5]
     [ 0. ]
     [-0. ]
     [-0. ]]

Train our new non-linear model

class Model:
    def __init__(self, input_dim, num_hidden):
        self.linear1 = Linear(input_dim, num_hidden)
        self.relu = Relu()
        self.linear2 = Linear(num_hidden, 1)
        
    def __call__(self, x):
        l1 = self.linear1(x)
        r = self.relu(l1)
        l2 = self.linear2(r)
        return l2 
    
    def backward(self, output_gradient):
        linear2_gradient = self.linear2.backward(output_gradient)
        relu_gradient = self.relu.backward(linear2_gradient)
        linear1_gradient = self.linear1.backward(relu_gradient)
        # print('Model backward', linear2_gradient.shape, relu_gradient.shape, linear1_gradient.shape)
        # import pdb ; pdb.set_trace()
        return linear1_gradient
    
    def update(self, lr):
        self.linear2.update(lr)
        self.linear1.update(lr)
        
loss = MSE()
model = Model(d, 10)
y_pred = model(x)
loss_value = loss(y_pred, y_true)
loss_gradient = loss.backward()
print(loss_value)
model.backward(loss_gradient)
plot_3d(x, y_true, y_pred)

>>> 20.426093114633776

# Test just one forward and backward step 
loss = MSE()
model = Model(d, 10)
y_pred = model(x)
loss_value = loss(y_pred, y_true)
print(loss_value)
loss_gradient = loss.backward()
model.backward(loss_gradient)
model.update(0.1)
y_pred = model(x)
loss_value = loss(y_pred, y_true)
print(loss_value)
>>> 17.92069472815522
    8.80429405983048
fit(x, y_true, model = model, loss = loss, lr = 0.1, num_epochs = 40)
plot_3d(x, y_true, model(x))
>>> Epoch 0, loss 8.80429405983048
    Epoch 1, loss 5.085918541816841
    Epoch 2, loss 3.4039159969134847
    Epoch 3, loss 2.4681281091920857
    Epoch 4, loss 1.8485284667459465
    Epoch 5, loss 1.4276282932172117
    Epoch 6, loss 1.1327997420829552
    Epoch 7, loss 0.9290792976378728
    Epoch 8, loss 0.7799888220358488
    Epoch 9, loss 0.6699261327956559
    Epoch 10, loss 0.587296519858608
    Epoch 11, loss 0.5238686972933833
    Epoch 12, loss 0.4732083016075399
    Epoch 13, loss 0.4317766396612485 
    Epoch 14, loss 0.39699028302697154
    Epoch 15, loss 0.3673241810620796
    Epoch 16, loss 0.341319222192828
    Epoch 17, loss 0.3186440500997158
    Epoch 18, loss 0.2987895524364333
    Epoch 19, loss 0.2811179639881686
    Epoch 20, loss 0.2651405126234886
    Epoch 21, loss 0.25097702320765514
    Epoch 22, loss 0.23826000298362504
    Epoch 23, loss 0.22677932884242566
    Epoch 24, loss 0.21660060154462232
    Epoch 25, loss 0.20750622546778963
    Epoch 26, loss 0.1991320633827995
    Epoch 27, loss 0.1916063151126626
    Epoch 28, loss 0.18469121783326492
    Epoch 29, loss 0.1785179516110991
    Epoch 30, loss 0.17301685884748444
    Epoch 31, loss 0.16808495165640042
    Epoch 32, loss 0.1636889213000724
    Epoch 33, loss 0.15959886847872123
    Epoch 34, loss 0.15594862866578224
    Epoch 35, loss 0.15267198587910785
    Epoch 36, loss 0.14966129092702957
    Epoch 37, loss 0.14696735509994738
    Epoch 38, loss 0.14449833219759006
    Epoch 39, loss 0.1421380783477851

Same thing, in PyTorch

Call the base class's __init__ method with super() (a minimal illustration of why this is required follows below).
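
A minimal illustration (assumed example, not from the course notebook): nn.Module.__init__ sets up the internal registries that track submodules and parameters, so assigning a layer before calling it fails.

import torch.nn as nn

class Broken(nn.Module):
    def __init__(self):
        # super().__init__() is deliberately missing
        self.linear = nn.Linear(2, 1)

try:
    Broken()
except AttributeError as e:
    # AttributeError: cannot assign module before Module.__init__() call
    print('without super().__init__():', e)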

import torch 
import torch.nn as nn 

class TorchModel(nn.Module):
    def __init__(self, input_dim, num_hidden):
        super().__init__() # When using torch, you must initialize the base class (nn.Module) like this.
        self.linear1 = nn.Linear(input_dim, num_hidden)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(num_hidden, 1)
        
    def forward(self, x):
        l1 = self.linear1(x)
        r = self.relu(l1)
        l2 = self.linear2(r)
        return l2 
    
loss = nn.MSELoss()
model = TorchModel(d,10)
x_tensor = torch.tensor(x).float()
y_true_tensor = torch.tensor(y_true).float()
y_pred_tensor = model(x_tensor)
loss_value = loss(y_pred_tensor, y_true_tensor)
print(loss_value)
>>> tensor(19.5986, grad_fn=<MseLossBackward>)
# Test just one forward and backward step 
optimizer = torch.optim.SGD(model.parameters(), lr = 0.1)

optimizer.zero_grad()
y_pred_tensor = model(x_tensor)
loss_value = loss(y_pred_tensor, y_true_tensor)
print(loss_value)
loss_value.backward()  # backward() returns None; the gradients are stored on the model parameters
optimizer.step()

y_pred_tensor = model(x_tensor)
loss_value = loss(y_pred_tensor, y_true_tensor)
print(loss_value)
>>> tensor(19.5986, grad_fn=<MseLossBackward>)
    tensor(10.0270, grad_fn=<MseLossBackward>)
# Now we run the training loop 

def torch_fit(x: torch.Tensor, y: torch.Tensor, model: Callable, loss: Callable, lr: float, num_epochs: int):
    optimizer = torch.optim.SGD(model.parameters(), lr = lr)
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        y_pred = model(x)
        loss_value = loss(y_pred, y)
        print(loss_value)
        loss_value.backward()
        optimizer.step()
        
torch_fit(x_tensor, y_true_tensor, model = model, loss = loss, lr = 0.1, num_epochs = 40)
plot_3d(x, y_true, model(x_tensor).detach())
>>> tensor(0.2399, grad_fn=<MseLossBackward>)
    tensor(0.2361, grad_fn=<MseLossBackward>)
    tensor(0.2326, grad_fn=<MseLossBackward>)
    tensor(0.2294, grad_fn=<MseLossBackward>)
    tensor(0.2263, grad_fn=<MseLossBackward>)
    tensor(0.2234, grad_fn=<MseLossBackward>)
    tensor(0.2205, grad_fn=<MseLossBackward>)
    tensor(0.2179, grad_fn=<MseLossBackward>)
    tensor(0.2154, grad_fn=<MseLossBackward>)
    tensor(0.2130, grad_fn=<MseLossBackward>)
    tensor(0.2107, grad_fn=<MseLossBackward>)
    tensor(0.2085, grad_fn=<MseLossBackward>)
    tensor(0.2064, grad_fn=<MseLossBackward>)
    tensor(0.2043, grad_fn=<MseLossBackward>)
    tensor(0.2023, grad_fn=<MseLossBackward>)
    tensor(0.2004, grad_fn=<MseLossBackward>)
    tensor(0.1985, grad_fn=<MseLossBackward>)
    tensor(0.1967, grad_fn=<MseLossBackward>)
    tensor(0.1949, grad_fn=<MseLossBackward>)
    tensor(0.1930, grad_fn=<MseLossBackward>)
    tensor(0.1910, grad_fn=<MseLossBackward>)
    tensor(0.1891, grad_fn=<MseLossBackward>)
    tensor(0.1872, grad_fn=<MseLossBackward>)
    tensor(0.1854, grad_fn=<MseLossBackward>)
    tensor(0.1836, grad_fn=<MseLossBackward>)
    tensor(0.1818, grad_fn=<MseLossBackward>)
    tensor(0.1800, grad_fn=<MseLossBackward>)
    tensor(0.1783, grad_fn=<MseLossBackward>)
    tensor(0.1766, grad_fn=<MseLossBackward>)
    tensor(0.1749, grad_fn=<MseLossBackward>)
    tensor(0.1730, grad_fn=<MseLossBackward>)
    tensor(0.1709, grad_fn=<MseLossBackward>)
    tensor(0.1688, grad_fn=<MseLossBackward>)
    tensor(0.1667, grad_fn=<MseLossBackward>)
    tensor(0.1647, grad_fn=<MseLossBackward>)
    tensor(0.1626, grad_fn=<MseLossBackward>)
    tensor(0.1605, grad_fn=<MseLossBackward>)
    tensor(0.1583, grad_fn=<MseLossBackward>)
    tensor(0.1562, grad_fn=<MseLossBackward>)
    tensor(0.1540, grad_fn=<MseLossBackward>)

Same thing, in Tensorflow/Keras

from tensorflow import keras 
from tensorflow.keras import layers 
from tensorflow.keras import optimizers 

inputs = keras.Input(shape=(2,))
l1 = layers.Dense(10, activation = 'relu', name = 'dense_1')(inputs)
outputs = layers.Dense(1, name = 'regression')(l1)

model = keras.Model(inputs = inputs, outputs = outputs)
model.summary()
model.compile(loss = 'mse', optimizer = optimizers.SGD(0.1))

model.fit(x, y_true, epochs = 10)
y_pred = model.predict(x)

plot_3d(x, y_true, model(x))
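
Optionally (assumed addition), model.evaluate reports the MSE on the training data after fitting:

final_mse = model.evaluate(x, y_true, verbose = 0)
print('final training MSE:', final_mse)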