
Neural Networks and Deep Learning - Week 4 Programming Assignment

피준 2022. 7. 8. 17:24

Let's skip the data preprocessing and library setup and look only at the core code.

The Week 4 assignment is to build a model that determines whether a picture is of a cat.

It's a cat.

Let's implement this as a model with L layers.
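
For context, here is roughly how the finished model ends up being called. This is only a sketch: train_x, train_y, and the layer sizes below are placeholders, assuming the 64x64x3 cat images have already been flattened into columns of length 12288.

# Hypothetical call; train_x has shape (12288, m), train_y has shape (1, m)
layers_dims = [12288, 20, 7, 5, 1]   # example: a 4-layer network
parameters, costs = L_layer_model(train_x, train_y, layers_dims,
                                  num_iterations=2500, print_cost=True)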

 

First, let's look at the model itself to get a sense of the overall flow.

def L_layer_model(X, Y, layers_dims, learning_rate = 0.0075, num_iterations = 3000, print_cost=False):
    """
    Arguments:
    X -- data, numpy array of shape (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 1 if cat, 0 if non-cat), of shape (1, number of examples)
    layers_dims -- list containing the input size and each layer size, of length (number of layers + 1).
    learning_rate -- learning rate of the gradient descent update rule
    num_iterations -- number of iterations of the optimization loop
    print_cost -- if True, it prints the cost every 100 steps
    
    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    costs -- list of costs recorded during training (every 100 iterations)
    """
    
    # Code for grading and for checking results
    np.random.seed(1)
    costs = []
    #---------------------------
    
    # Initialize parameters
    parameters=initialize_parameters_deep(layers_dims)
    
    # Optimize the parameters with gradient descent
    for i in range(0, num_iterations):

        # 1. Forward propagation
        # The last layer uses sigmoid as its activation; the others use ReLU
        AL, caches = L_model_forward(X,parameters)
        
        # 2. compute cost
        cost=compute_cost(AL,Y)
        
        # 3. Backward propagation.
        grads=L_model_backward(AL,Y,caches)
        
        # 4. Update parameters.
        parameters = update_parameters(parameters,grads,learning_rate)
                
        # Logging for checking progress --------------------------------------
        if print_cost and (i % 100 == 0 or i == num_iterations - 1):
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if i % 100 == 0 or i == num_iterations - 1:
            costs.append(cost)
        #---------------------------------------------------------------------
    return parameters, costs

The model uses gradient descent, repeating four steps: forward propagation, cost computation, backward propagation, and a parameter update.
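
The helper initialize_parameters_deep is provided by the course and is not shown in this post. A minimal sketch of what it might look like, assuming the scaled random initialization the assignment uses (weights divided by the square root of the previous layer size, biases set to zero):

import numpy as np

def initialize_parameters_deep(layer_dims):
    # layer_dims -- list of layer sizes, e.g. [12288, 20, 7, 5, 1]
    parameters = {}
    L = len(layer_dims)
    for l in range(1, L):
        # Scale weights by 1/sqrt(n_prev) to keep activations from blowing up
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) / np.sqrt(layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
    return parameters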

Let's look at how each function is implemented.

Forward Propagation

def L_model_forward(X, parameters):
    """
    Arguments:
    X -- data, numpy array of shape (input size, number of examples)
    parameters -- W and b initialized by initialize_parameters_deep()
    
    Returns:
    AL -- A[L], the post-activation output of the final sigmoid layer
    caches -- list of caches containing:
                every cache of linear_activation_forward() (there are L of them, indexed from 0 to L-1)
    """

    caches = []
    A = X
    # Each layer has a W and a b, so dividing by 2 gives the number of layers
    L = len(parameters) // 2
    
    # Layers 1 through L-1 use ReLU for forward propagation
    for l in range(1, L):
        A_prev = A 
        A,cache=linear_activation_forward(A_prev,parameters['W'+str(l)],parameters['b'+str(l)],activation="relu")
        caches.append(cache)
        
    # The last layer uses sigmoid as its activation function
    AL,cache=linear_activation_forward(A,parameters['W'+str(L)],parameters['b'+str(L)],activation="sigmoid")
    caches.append(cache)
          
    return AL, caches
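
At this point AL holds one predicted probability per example. A quick shape check (not part of the assignment code) makes the contract explicit:

AL, caches = L_model_forward(X, parameters)
assert AL.shape == (1, X.shape[1])            # one probability per example
assert len(caches) == len(parameters) // 2    # one cache per layer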

 

def linear_activation_forward(A_prev, W, b, activation):
    """
    Computes the forward step for the given activation function
    Arguments:
    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A -- the output of the activation function, also called the post-activation value 
    cache -- a python tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently
    """
    
    if activation == "sigmoid":
        Z, linear_cache = linear_forward(A_prev,W,b)
        A,activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = linear_forward(A_prev,W,b)
        A,activation_cache = relu(Z)
    
    cache = (linear_cache, activation_cache)

    return A, cache
def linear_forward(A, W, b):
    """
    Computes the linear part of forward propagation for a single layer
    Arguments:
    A -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z -- the input of the activation function, also called pre-activation parameter 
    cache -- a python tuple containing "A", "W" and "b" ; stored for computing the backward pass efficiently
    """
    Z=np.dot(W,A)+b
    cache = (A, W, b)
    
    return Z, cache
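
The sigmoid and relu helpers called by linear_activation_forward are also provided by the course. They return the activation together with a cache of Z for the backward pass; a minimal sketch:

def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    return A, Z    # the cache is simply Z

def relu(Z):
    A = np.maximum(0, Z)
    return A, Z    # the cache is simply Z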

Compute Cost

Since this is binary classification, the cross-entropy cost below is used.
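
For m examples, the cost computed by the function below is

$$J = -\frac{1}{m}\sum_{i=1}^{m}\Big[\, y^{(i)}\log\big(a^{[L](i)}\big) + \big(1-y^{(i)}\big)\log\big(1-a^{[L](i)}\big) \Big]$$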

def compute_cost(AL, Y):
    """
    Arguments:
    AL -- probability vector corresponding to your label predictions, shape (1, number of examples)
    Y -- true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost -- cross-entropy cost
    """
    
    m = Y.shape[1]

    cost=-np.sum(Y*np.log(AL)+(1-Y)*np.log(1-AL))/m    
    # Convert the numpy array to a plain scalar
    # e.g. [[17]] -> 17
    cost = np.squeeze(cost)
    
    return cost
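
One practical caveat not handled here: if AL ever reaches exactly 0 or 1, np.log returns -inf. A common safeguard (an addition on my part, not part of the course code) is to clip the predictions before taking the log:

eps = 1e-8
AL = np.clip(AL, eps, 1 - eps)   # keep log() finite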

Backward Propagation

def L_model_backward(AL, Y, caches):
    """
    Arguments:
    AL -- probability vector, output of the forward propagation (L_model_forward())
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat)
    caches -- list of caches containing:
                every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])
    
    Returns:
    grads -- A dictionary with the gradients
             grads["dA" + str(l)] = ... 
             grads["dW" + str(l)] = ...
             grads["db" + str(l)] = ... 
    """
    grads = {}
    L = len(caches) # the number of layers
    m = AL.shape[1]
    Y = Y.reshape(AL.shape) # after this line, Y is the same shape as AL
    
    # Derivative of the cost with respect to AL;
    # it can be obtained by differentiating the cost function above
    dAL=-(np.divide(Y,AL)-np.divide(1-Y,1-AL))
    
    # The last layer used sigmoid, so it is handled separately
    current_cache=caches[L-1]
    grads["dA"+str(L-1)],grads["dW"+str(L)],grads["db"+str(L)] = \
        linear_activation_backward(dAL,current_cache,activation="sigmoid")
    
    # The remaining layers back-propagate through ReLU
    for l in reversed(range(L-1)):
        current_cache=caches[l]
        grads["dA"+str(l)],grads["dW"+str(l+1)],grads["db"+str(l+1)] = \
            linear_activation_backward(grads["dA"+str(l+1)],current_cache,activation="relu")
        
    return grads
def linear_activation_backward(dA, cache, activation):
    """
    Computes the backward step for the given activation function
    Arguments:
    dA -- post-activation gradient for current layer l 
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"
    
    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    linear_cache, activation_cache = cache
    
    # dZ = dA * g'(Z), so the computation depends on the activation function
    if activation == "relu":
        dZ=relu_backward(dA,activation_cache)
        dA_prev,dW,db=linear_backward(dZ,linear_cache)
        
    elif activation == "sigmoid":
        dZ=sigmoid_backward(dA,activation_cache)
        dA_prev,dW,db=linear_backward(dZ,linear_cache)
        
    return dA_prev, dW, db
def linear_backward(dZ, cache):
    """
    Computes backward propagation for the linear part of a single layer
    Arguments:
    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    A_prev, W, b = cache
    m = A_prev.shape[1]

    # As derived earlier, these formulas follow from differentiating the cost with respect to W, b, and A_prev
    dW=np.dot(dZ,A_prev.T)/m
    db=np.sum(dZ,axis=1,keepdims=True)/m
    dA_prev=np.dot(W.T,dZ)
    
    return dA_prev, dW, db
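
The relu_backward and sigmoid_backward helpers used in linear_activation_backward are likewise provided by the course; given dA and the cached Z they compute dZ = dA * g'(Z). A rough sketch:

def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0             # ReLU'(Z) is 0 where Z <= 0, 1 elsewhere
    return dZ

def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    return dA * s * (1 - s)    # sigmoid'(Z) = s * (1 - s)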

Update parameters

def update_parameters(params, grads, learning_rate):
    """
    Applies the gradient descent update to the parameters
    Arguments:
    params -- python dictionary containing your parameters 
    grads -- python dictionary containing your gradients, output of L_model_backward
    
    Returns:
    parameters -- python dictionary containing your updated parameters 
                  parameters["W" + str(l)] = ... 
                  parameters["b" + str(l)] = ...
    """
    parameters = params.copy()
    L = len(parameters) // 2  # each layer has a W and a b

    for l in range(L):
        parameters["W" + str(l+1)]=parameters["W" + str(l+1)]-learning_rate*grads["dW"+str(l+1)]
        parameters["b" + str(l+1)]=parameters["b" + str(l+1)]-learning_rate*grads["db"+str(l+1)]
   
    return parameters
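
Finally, the learned parameters are used to classify new images with a predict helper. It is not shown in this post; a minimal sketch, assuming a 0.5 decision threshold:

def predict(X, parameters):
    # Forward propagate and threshold the output probabilities at 0.5
    AL, _ = L_model_forward(X, parameters)
    return (AL > 0.5).astype(int)    # 1 = cat, 0 = non-cat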