crnn 코드 뜯어보기(2)

logi·2023년 11월 9일

Machine Learning

목록 보기

2/2

def crnn(self, max_width):
        """
            Build the CRNN graph: a CNN feature extractor, two stacked
            bidirectional LSTM layers, a per-timestep linear projection to
            self.NUM_CLASSES, a CTC loss with an Adam training op, and a CTC
            beam-search decoder.

            Args:
                max_width: size of axis 1 of the input placeholder, whose
                    shape is [batch, max_width, 32, 1].

            Returns:
                A 10-tuple, in this order:
                    inputs          float32 placeholder [batch, max_width, 32, 1]
                    targets         int32 sparse placeholder with the labels
                    seq_len         int32 placeholder [batch] of sequence lengths
                    logits          time-major logits
                                    [max_char_count, batch, NUM_CLASSES]
                    dense_decoded   decoded labels as a dense tensor, padded
                                    with -1
                    optimizer       the op returned by AdamOptimizer.minimize
                                    (a training step, not the optimizer object)
                    acc             mean edit distance between the decoded
                                    output and targets (an error measure, so
                                    lower is better, despite the name)
                    cost            mean CTC loss over the batch
                    max_char_count  static size of axis 1 of the CNN output
                    init            global variables initializer op
        """

        def BidirectionnalRNN(inputs, seq_len):
            """
                Two stacked bidirectional LSTM layers.

                Each layer runs a forward and a backward BasicLSTMCell with
                256 units and concatenates both directions on the last axis,
                so each layer outputs 512 features per timestep.
            """

            # Scope name is None, so default_name is used for the scope.
            with tf.variable_scope(None, default_name="bidirectional-rnn-1"):
                # Forward
                lstm_fw_cell_1 = rnn.BasicLSTMCell(256)
                # Backward
                lstm_bw_cell_1 = rnn.BasicLSTMCell(256)

                # Only the outputs are kept; the final states are discarded.
                inter_output, _ = tf.nn.bidirectional_dynamic_rnn(
                    lstm_fw_cell_1, lstm_bw_cell_1, inputs, seq_len, dtype=tf.float32
                )

                # Join the (forward, backward) outputs along the feature axis.
                inter_output = tf.concat(inter_output, 2)

            with tf.variable_scope(None, default_name="bidirectional-rnn-2"):
                # Forward
                lstm_fw_cell_2 = rnn.BasicLSTMCell(256)
                # Backward
                lstm_bw_cell_2 = rnn.BasicLSTMCell(256)

                # Second layer consumes the concatenated output of the first.
                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    lstm_fw_cell_2,
                    lstm_bw_cell_2,
                    inter_output,
                    seq_len,
                    dtype=tf.float32,
                )

                # Join the (forward, backward) outputs along the feature axis.
                outputs = tf.concat(outputs, 2)

            return outputs

        def CNN(inputs):
            """
                Convolutional feature extractor: seven ReLU conv layers, four
                max-pooling layers and two batch-normalization layers.

                The pooling layers and the final 2 x 2 "valid" convolution
                shrink the size-32 axis of the input step by step
                (32 -> 16 -> 8 -> 4 -> 2 -> 1), which is what allows the
                caller to squeeze axis 2 of the returned tensor.
            """

            # Conv: 64 filters, 3 x 3 kernel, "same" padding
            conv1 = tf.layers.conv2d(
                inputs=inputs,
                filters=64,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
            )

            # Max pool: 2 x 2 window, stride 2 on both axes
            pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)

            # Conv: 128 filters, 3 x 3 kernel, "same" padding
            conv2 = tf.layers.conv2d(
                inputs=pool1,
                filters=128,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
            )

            # Max pool: 2 x 2 window, stride 2 on both axes
            pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)

            # Conv: 256 filters, 3 x 3 kernel, "same" padding
            conv3 = tf.layers.conv2d(
                inputs=pool2,
                filters=256,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
            )

            # Batch normalization layer
            # NOTE(review): `training` is not passed, so the tf.layers default
            # applies and no update ops are wired into the training step;
            # confirm this is intended.
            bnorm1 = tf.layers.batch_normalization(conv3)

            # Conv: 256 filters, 3 x 3 kernel, "same" padding
            conv4 = tf.layers.conv2d(
                inputs=bnorm1,
                filters=256,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
            )

            # Max pool: 2 x 2 window, stride 1 on axis 1 and 2 on axis 2,
            # "same" padding, so only the size-32 axis is halved here.
            pool3 = tf.layers.max_pooling2d(
                inputs=conv4, pool_size=[2, 2], strides=[1, 2], padding="same"
            )

            # Conv: 512 filters, 3 x 3 kernel, "same" padding
            conv5 = tf.layers.conv2d(
                inputs=pool3,
                filters=512,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
            )

            # Batch normalization layer (same `training` caveat as bnorm1)
            bnorm2 = tf.layers.batch_normalization(conv5)

            # Conv: 512 filters, 3 x 3 kernel, "same" padding
            conv6 = tf.layers.conv2d(
                inputs=bnorm2,
                filters=512,
                kernel_size=(3, 3),
                padding="same",
                activation=tf.nn.relu,
            )

            # Max pool: 2 x 2 window, stride 1 on axis 1 and 2 on axis 2,
            # "same" padding
            pool4 = tf.layers.max_pooling2d(
                inputs=conv6, pool_size=[2, 2], strides=[1, 2], padding="same"
            )

            # Conv: 512 filters, 2 x 2 kernel, "valid" (no) padding; this
            # removes one position from each spatial axis.
            conv7 = tf.layers.conv2d(
                inputs=pool4,
                filters=512,
                kernel_size=(2, 2),
                padding="valid",
                activation=tf.nn.relu,
            )

            return conv7

        # Batch dimension is left unspecified; it is fixed by the data fed in.
        batch_size = None
        inputs = tf.placeholder(
            tf.float32, [batch_size, max_width, 32, 1], name="input"
        )

        # Our target output
        targets = tf.sparse_placeholder(tf.int32, name="targets")

        # The length of the sequence
        seq_len = tf.placeholder(tf.int32, [None], name="seq_len")

        cnn_output = CNN(inputs)
        # Drop axis 2 (size 1 after the CNN) to get [batch, time, 512].
        reshaped_cnn_output = tf.squeeze(cnn_output, [2])
        # Static number of timesteps produced by the CNN (axis 1).
        max_char_count = cnn_output.get_shape().as_list()[1]

        crnn_model = BidirectionnalRNN(reshaped_cnn_output, seq_len)

        # Flatten to [batch * time, 512] so one matmul projects every timestep.
        # 512 = 256 forward units + 256 backward units.
        logits = tf.reshape(crnn_model, [-1, 512])
        W = tf.Variable(
            tf.truncated_normal([512, self.NUM_CLASSES], stddev=0.1), name="W"
        )
        b = tf.Variable(tf.constant(0.0, shape=[self.NUM_CLASSES]), name="b")

        logits = tf.matmul(logits, W) + b
        # Restore the [batch, max_char_count, NUM_CLASSES] layout.
        logits = tf.reshape(
            logits, [tf.shape(cnn_output)[0], max_char_count, self.NUM_CLASSES]
        )

        # Switch to time-major [max_char_count, batch, NUM_CLASSES] before
        # passing the logits to the CTC ops below.
        logits = tf.transpose(logits, (1, 0, 2))

        # Loss and cost calculation
        loss = tf.nn.ctc_loss(
            targets, logits, seq_len, ignore_longer_outputs_than_inputs=True
        )

        cost = tf.reduce_mean(loss)

        # Training step
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001).minimize(cost)

        # The decoded answer
        decoded, log_prob = tf.nn.ctc_beam_search_decoder(
            logits, seq_len, merge_repeated=False
        )
        # Only the first decoded path is used; empty positions become -1.
        dense_decoded = tf.sparse_tensor_to_dense(
            decoded[0], default_value=-1, name="dense_decoded"
        )

        # The error rate: mean edit distance between prediction and targets
        acc = tf.reduce_mean(tf.edit_distance(tf.cast(decoded[0], tf.int32), targets))

        init = tf.global_variables_initializer()

        return (
            inputs,
            targets,
            seq_len,
            logits,
            dense_decoded,
            optimizer,
            acc,
            cost,
            max_char_count,
            init,
        )

crnn 함수는 max_width를 매개변수로 받고 nested function으로 BidirectionnalRNN(inputs, seq_len)과 CNN(inputs)을 가지고 있다.

우선 BidirectionnalRNN 부터 살펴보자.

BidirectionnalRNN은 매개변수로 inputs와 seq_len을 가지고 있고 with문으로 짜여진 두 개의 변수 스코프로 이루어져 있다.

변수 스코프란

변수 스코프(Variable Scope)는 텐서플로에서 변수들을 조직화하고 관리하는 방법 중 하나이다. 변수 스코프를 사용하면 모델 내에서 변수들을 계층적으로 구성하고, 변수의 이름 충돌을 방지하며, 코드를 더 읽기 쉽게 만들 수 있다.

우리가 살펴보고 있는 BidirectionnalRNN의 코드를 통해 변수 스코프를 쓰는 방법을 자세히 알아보자.

with tf.variable_scope(None, default_name="bidirectional-rnn-1"):
                # Forward
                lstm_fw_cell_1 = rnn.BasicLSTMCell(256)
                # Backward
                lstm_bw_cell_1 = rnn.BasicLSTMCell(256)

                inter_output, _ = tf.nn.bidirectional_dynamic_rnn(
                    lstm_fw_cell_1, lstm_bw_cell_1, inputs, seq_len, dtype=tf.float32
                )

                inter_output = tf.concat(inter_output, 2)
                
with tf.variable_scope(None, default_name="bidirectional-rnn-2"):
                # Forward
                lstm_fw_cell_2 = rnn.BasicLSTMCell(256)
                # Backward
                lstm_bw_cell_2 = rnn.BasicLSTMCell(256)

                outputs, _ = tf.nn.bidirectional_dynamic_rnn(
                    lstm_fw_cell_2,
                    lstm_bw_cell_2,
                    inter_output,
                    seq_len,
                    dtype=tf.float32,
                )

                outputs = tf.concat(outputs, 2)

변수 스코프는 default_name으로 각각 "bidirectional-rnn-1", "bidirectional-rnn-2"를 가지고 있다.
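첫 번째 인자(스코프 이름)가 None이므로 default_name이 스코프 이름으로 쓰이고, 같은 이름의 스코프가 이미 있으면 뒤에 번호가 붙어 이름 충돌을 피한다. 따라서 lstm_fw_cell_1, lstm_bw_cell_1이 만드는 LSTM 가중치와 inter_output을 계산하는 연산은 bidirectional-rnn-1 변수 스코프 안에서, lstm_fw_cell_2, lstm_bw_cell_2가 만드는 가중치와 outputs를 계산하는 연산은 bidirectional-rnn-2 변수 스코프 안에서 생성되고, 텐서플로 그래프 상의 이름에 변수 스코프의 이름을 접두어로 가지게 된다(파이썬 변수 이름 자체가 바뀌는 것은 아니다).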

변수 스코프의 개념을 이해했으니 이제 변수 스코프 내에 선언된 변수들에 대해 자세히 알아보자.

lstm_fw_cell_1: rnn.BasicLSTMCell(256)이 256개의 유닛을 가진 전방향 LSTM 셀을 생성하여 저장한다.
lstm_bw_cell_1: rnn.BasicLSTMCell(256)이 256개의 유닛을 가진 후방향 LSTM 셀을 생성하여 저장한다.

여기서 lstm이라는 생소한 용어가 나와 당황했는데,
LSTM, Bidirectional LSTM
다음의 블로그 글을 보고 해결했다. LSTM은 RNN의 한 종류일 뿐이었다. RNN은 많은 시간이 지나면 이전의 input을 잊어버린다는 단점이 있었는데, 이것을 'RNN의 장기의존문제'라고 부른다고 한다. 여기서 LSTM이라는 모델은 기억 셀(memory cell)을 추가하면서 이 문제를 해결했다. 우리가 지금 분석하고 있는 이 코드에서도 LSTM을 사용하여 RNN을 구현해 주려고 하는 것 같다.

하지만 LSTM을 직접 구현해주는 형태가 아니라, 코드의 첫부분에서 from tensorflow.contrib import rnn 문으로 tensorflow의 contrib라는 라이브러리에서 rnn을 import 해오는 식으로 구현하였기에 모든 모델을 직접 구현하려고 하는 우리 팀의 입장에서는 그대로 쓰기 곤란하다는 생각이 들었다. 그리고 우리가 사용하려고 하는 데이터셋 특성상 rnn의 장기의존문제가 발생할 정도로 input이 길지는 않을 것 같아 그냥 상대적으로 간단해 보이는 rnn의 구현이 더 좋을 것 같기도 하다. 물론 구현해봐야 알겠지만 말이다.

outputs, _ = tf.nn.bidirectional_dynamic_rnn(
lstm_fw_cell_2,
lstm_bw_cell_2,
inter_output,
seq_len,
dtype=tf.float32,
)

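가장 이해하기 어려웠던 부분이었다. outputs, 옆에 붙은 _의 존재가 이해되지 않았다. 이 부분은 알고 보니 반환값이 2개인 tf.nn.bidirectional_dynamic_rnn 함수의 특성 때문이었다. 이 함수는 (outputs, output_states) 두 값을 반환하는데, 이 코드에서는 첫 번째인 outputs만 사용하고 두 번째인 최종 상태(output_states)는 무시하기 위해 _ 를 사용했다. 참고로 outputs는 (전방향 출력, 후방향 출력) 두 텐서의 튜플이어서, 바로 다음 줄의 tf.concat(outputs, 2)로 마지막 축을 따라 하나의 텐서로 이어 붙인다.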

batch_size = None
        inputs = tf.placeholder(
            tf.float32, [batch_size, max_width, 32, 1], name="input"
        )

batch size는 한 iteration 안에 돌아갈 훈련 데이터의 양이다. batch_size = None으로 설정해줬다는 것은 batch size를 고정해주지 않았다는 것을 의미한다. 이렇게 하면 batch size는 나중에 모델이 실행될 때 placeholder에 넣어주는 데이터의 크기에 따라 정해진다.

tf.placeholder

tf.placeholder 함수는 나중에 값이 채워질 텐서의 '자리'를 그래프 안에 미리 만들어 두는 함수이다. 그래프를 정의할 때는 자료형과 형태만 정해 두고, 실제 값은 세션을 실행할 때 feed_dict를 통해 placeholder에 매핑시킨다. 'placeholder'의 뜻이 '자리 표시자'라는 것을 생각해 보면 감이 좀 온다.
placeholder 문법
다음 블로그를 참고해 이해하였다.

첫번째 매개변수 tf.float32는 placeholder에 들어갈 값의 자료형(dtype), 두번째 매개변수 [batch_size, max_width, 32, 1]는 placeholder의 형태(shape), 그리고 세번째 매개변수 name="input"는 placeholder의 이름을 정해준다.

  		#Our target output
        targets = tf.sparse_placeholder(tf.int32, name="targets")

        # The length of the sequence
        seq_len = tf.placeholder(tf.int32, [None], name="seq_len")

cnn 출력값을 BidirectionnalRNN에 넣어주기

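targets = tf.sparse_placeholder(tf.int32, name="targets")은 목표 출력에 대한 sparse_placeholder를 생성한다. tf.sparse_placeholder 함수는 sparse tensor(희소 텐서)를 처리하는 데 사용된다. 희소 텐서의 값의 데이터 유형은 tf.int32로 설정되어 있다. seq_len = tf.placeholder(tf.int32, [None], name="seq_len")은 배치 안 각 샘플의 시퀀스 길이를 담는 1차원 tf.int32 placeholder를 생성한다.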

cnn_output = CNN(inputs)
reshaped_cnn_output = tf.squeeze(cnn_output, [2])
max_char_count = cnn_output.get_shape().as_list()[1]

cnn_output = CNN(inputs): cnn에 의해 처리된 결과를 cnn_output에 저장한다.
reshaped_cnn_output = tf.squeeze(cnn_output, [2]): tf.squeeze 함수는 특정 차원에서 크기가 1인 차원을 제거하여 데이터를 압축하는 역할을 한다. 이 코드에서는 cnn_output에서 세 번째 차원([2])에서 크기가 1인 차원을 제거하였다.
max_char_count = cnn_output.get_shape().as_list()[1]: cnn_output의 형태(shape) 정보를 사용하여 출력 데이터의 두 번째 차원의 크기를 가져온다(이 값은 최대 문자 수를 나타낸다).
코드에서는 get_shape().as_list()를 사용하여 텐서의 형태를 리스트로 변환하고, 그 중에서 두 번째 요소를 선택하여 최대 문자 수를 얻는다.

종합하면, 이 코드는 CNN을 사용하여 입력 데이터를 처리하고, 그 결과를 압축하여 다루기 쉬운 형태로 만든다. 그리고 최대 문자 수를 max_char_count 변수에 저장한다.

crnn_model = BidirectionnalRNN(reshaped_cnn_output, seq_len)

BidirectionnalRNN 함수에 CNN의 출력(reshaped_cnn_output)과 시퀀스 길이(seq_len)를 넣어 호출하고, 반환된 Bi-RNN의 출력 텐서를 crnn_model에 저장한다.

Bi-RNN 출력의 최종 분류

crnn_model = BidirectionnalRNN(reshaped_cnn_output, seq_len) 
#앞에서 살펴본 코드. crnn_model이 Bi-RNN의 출력을 받는다.

logits = tf.reshape(crnn_model, [-1, 512])
W = tf.Variable(
	tf.truncated_normal([512, self.NUM_CLASSES], stddev=0.1), name="W"
) # 가중치 설정
b = tf.Variable(tf.constant(0.0, shape=[self.NUM_CLASSES]), name="b") #편향 설정

tf.reshape(crnn_model, [-1, 512]): tf.reshape 함수는 crnn_model에 담긴 Bi-RNN의 출력값을 2D 텐서로 변환한다. 변환된 텐서는 logits에 저장된다.
W = tf.Variable( tf.truncated_normal([512, self.NUM_CLASSES], stddev=0.1), name="W" ): 신경망의 가중치 설정
b = tf.Variable(tf.constant(0.0, shape=[self.NUM_CLASSES]), name="b"): 신경망의 편향 설정

logi

관성을 붙이자

이전 포스트