외부 파일을 BST에 저장하기

오동환·2023년 4월 19일

알고리즘

Algorithm

목록 보기

18/23

✏️ tsv 형식의 연락처 파일을 읽어서 이름을 기준으로 BST에 저장하고, find, add, trace, delete, save 함수를 지원한다.

프로그램을 구현하면서 골치 아팠던 것들을 정리했다. (파일 입출력...)

1. BST

: BST 구현에 사용했던 코드를 가져와 일부 수정했다.

원래 입력받은 인자로 데이터를 초기화하고 그 데이터의 주소를 리턴하는 Data* init_data()함수가 존재했었다.

하지만 어느 이유에선지? 새로운 노드를 add 후 그 노드의 parent node를 delete 하면 추가한 노드의 데이터가 훼손되는 이상한 오류가 발생했고, 노드를 초기화할때 데이터도 같이 초기화 해주니 오류가 사라졌다.

Data 구조체와 새로운 Data 포인터가 다른 함수로 전달되는 과정에서 overlpa된 메모리가 delete 함수에서 노드를 free 하는 과정에서 문제가 발생한 게 아닐까? 추정중이다.

Delete 함수의 아래 라인에서 문제가 발생했다.
free(node_to_delete);

typedef struct Data {
	char* name;
	char* company;
	char* address;
	char* zip;
	char* phone;
	char* email;
}Data;

typedef struct Node {
	Data* data;
	struct Node* parent_node;
	struct Node* left_node;
	struct Node* right_node;
}Node;

typedef struct BST {
	Node* root;
}BST;

/* Do not use
void init_data(Data* data, char* name, char* company, char* address, char* zip, char* phone, char* email) {
	data->name = name;
	data->company = company;
	data->address = address;
	data->zip = zip;
	data->phone = phone;
	data->email = email;
}
*/

void init_node(Node* node, char* name, char* company, char* address, char* zip, char* phone, char* email) {
	Data* new_data = (Data*)malloc(sizeof(Data));
	new_data->name = name;
	new_data->company = company;
	new_data->address = address;
	new_data->zip = zip;
	new_data->phone = phone;
	new_data->email = email;

	node->data = new_data;
	node->parent_node = NULL;
	node->left_node = NULL;
	node->right_node = NULL;
}

void init_bst(BST* bst) {
	bst->root = NULL;
}

Node* create_node(char* name, char* company, char* address, char* zip, char* phone, char* email) {
	Node* temp = (Node*)malloc(sizeof(Node));
	init_node(temp, name, company, address, zip, phone, email);

	return temp;
}

void insert_node(BST* pbst, char* name, char* company, char* address, char* zip, char* phone, char* email) {
	Node* new_node = create_node(name, company, address, zip, phone, email);

	Node* prev_node = NULL; // new_node's parent node
	Node* current_node = pbst->root;

	while (current_node != NULL) {
		prev_node = current_node;
		if (strcmp(new_node->data->name, current_node->data->name) <= 0) {
			current_node = current_node->left_node;
		}
		else {
			current_node = current_node->right_node;
		}
	}
	new_node->parent_node = prev_node;

	// tree is empty
	if (prev_node == NULL) {
		pbst->root = new_node;
	}

	// determine left or right
	else if (strcmp(new_node->data->name, prev_node->data->name) <= 0) {
		prev_node->left_node = new_node;
	}
	else {
		prev_node->right_node = new_node;
	}
}


Node* bst_search(BST bst, char* target_name) {
	if (bst.root == NULL || strcmp(target_name, bst.root->data->name) == 0) {
		return bst.root;
	}

	BST left_subtree;
	BST right_subtree;
	init_bst(&left_subtree);
	init_bst(&right_subtree);
	left_subtree.root = bst.root->left_node;
	right_subtree.root = bst.root->right_node;

	if (strcmp(target_name, bst.root->data->name)< 0) {
		return bst_search(left_subtree, target_name);
	}
	else {
		return bst_search(right_subtree, target_name);
	}
}

Node* bst_minimum(BST bst) {
	if (bst.root == NULL) {
		return NULL;
	}

	BST left_subtree;
	init_bst(&left_subtree);
	left_subtree.root = bst.root->left_node;

	Node* result = bst.root;

	while (left_subtree.root != NULL) {
		result = left_subtree.root;
		left_subtree.root = left_subtree.root->left_node;
	}

	return result;
}

Node* successor(BST bst) {
	if (bst.root == NULL) {
		return NULL;
	}

	BST left_subtree;
	BST right_subtree;
	init_bst(&left_subtree);
	init_bst(&right_subtree);
	left_subtree.root = bst.root->left_node;
	right_subtree.root = bst.root->right_node;

	if (right_subtree.root != NULL) {
		return bst_minimum(right_subtree);
	}

	Node* parent_node = bst.root->parent_node;
	while (parent_node != NULL && bst.root == parent_node->right_node) {
		bst.root = parent_node;
		parent_node = bst.root->parent_node;
	}

	return parent_node;
}

void move_data(Node* target, Node* node_to_move) {
	target->data = node_to_move->data;
}

void delete_node(BST* pbst, char* target_name) {
	// set a target node
	Node* node_to_delete;
	BST target_subtree;
	target_subtree.root = bst_search(*pbst, target_name);

	if (target_subtree.root == NULL) {
		printf("Invalid target");
		return;
	}

	// the number of child is 0 or 1
	if (target_subtree.root->left_node == NULL || target_subtree.root->right_node == NULL) {
		node_to_delete = target_subtree.root;
	}
	else {
		node_to_delete = successor(target_subtree);
	}

	// set a node to move
	Node* node_to_move;

	if (node_to_delete->left_node != NULL) {
		node_to_move = node_to_delete->left_node;
	}
	else {
		node_to_move = node_to_delete->right_node;
	}

	// move the node
	if (node_to_move != NULL) {
		node_to_move->parent_node = node_to_delete->parent_node;
	}

	if (node_to_delete->parent_node == NULL) {
		// the target node is root node
		pbst->root = node_to_move;
	}
	else if (node_to_delete == node_to_delete->parent_node->left_node) {
		node_to_delete->parent_node->left_node = node_to_move;
	}
	else {
		node_to_delete->parent_node->right_node = node_to_move;
	}

	// the number of child is 2
	if (node_to_delete != target_subtree.root) {
		// The node to delete is a successor
		// move data of succesosr node
		move_data(target_subtree.root, node_to_delete);
	}

	free(node_to_delete);
}

2. 함수 구현

list
: BST에 저장된 연락처를 inorder 순서로, 오름차순으로 정렬하여 출력한다.
find
: BST에서 이름을 검색하여 정보를 출력한다.
add
: 입력한 이름이 없다면 company, address, zip, phone, email를 입력받아 BST에 저장한다.
trace
: BST에서 root부터 입력된 이름의 노드까지의 경로에 있는 이름들을 출력한다.
delete
: 입력된 이름의 데이터를 BST에서 삭제한다.

add와 delete가 문제가 가장 많았다. (특히 데이터 훼손)

새로운 데이터를 add했을 때 address만 훼손되어 저장되는 오류가 있었다.

gets로 입력을 받은 고정된 크기를 가진 배열을 그대로 추가할 노드의 데이터로 넘겨주는 것이 문제였다.

따라서 strlen을 이용해 문자의 길이만큼 동적 할당받아 그 포인터를 데이터로 넘겨주니 해결되었다.

원본 데이터의 필드는 \t로 구분되어있고, 마지막 필드인 email은 마지막에 개행문자(\n)를 가지고 있어서 email을 입력받을 때 마찬가지로 strlen을 이용해 개행문자와 널문자를 추가해주었다.

void list(BST bst) {
	if (bst.root == NULL) {
		return;
	}

	BST left_subtree;
	BST right_subtree;
	init_bst(&left_subtree);
	init_bst(&right_subtree);
	left_subtree.root = bst.root->left_node;
	right_subtree.root = bst.root->right_node;

	list(left_subtree);
	printf("%s\n", bst.root->data->name);
	printf("\tCompany: %s\n", bst.root->data->company);
	printf("\tAddress: %s\n", bst.root->data->address);
	printf("\tZipcode: %s\n", bst.root->data->zip);
	printf("\tPhones: %s\n", bst.root->data->phone);
	printf("\tEmail: %s\n", bst.root->data->email);
	list(right_subtree);
}


void find(BST bst, char* target) {
	Node* found_target = bst_search(bst, target);
	if (found_target == NULL) {
		printf("Not found\n");
	}
	else {
		printf("%s\n", found_target->data->name);
		printf("\tCompany: %s\n", found_target->data->company);
		printf("\tAddress: %s\n", found_target->data->address);
		printf("\tZipcode: %s\n", found_target->data->zip);
		printf("\tPhones: %s\n", found_target->data->phone);
		printf("\tEmail: %s\n", found_target->data->email);
	}
}

void add(BST bst, char* name) {
	if (bst_search(bst, name) != NULL) {
		printf("The same name already exists\n");
		return;
	}

	char* company;
	char* address;
	char* zip;
	char* phone;
	char* email;

	char temp_company[100];
	char temp_address[100];
	char temp_zip[100];
	char temp_phone[100];
	char temp_email[100];

	// just passing 100 size of array could cause errors in future process
	// In order to resize them, dynamic arrays are necessary
	printf("Company? ");
	gets(temp_company);
	company = (char*)malloc(strlen(temp_company) + 1);
	strcpy(company, temp_company);

	printf("Address? ");
	gets(temp_address);
	address = (char*)malloc(strlen(temp_address) + 1);
	strcpy(address, temp_address);

	printf("Zipcode? ");
	gets(temp_zip);
	zip = (char*)malloc(strlen(temp_zip) + 1);
	strcpy(zip, temp_zip);

	printf("Phone? ");
	gets(temp_phone);
	phone = (char*)malloc(strlen(temp_phone) + 1);
	strcpy(phone, temp_phone);

	printf("Email? ");
	gets(temp_email);
	int email_len = strlen(temp_email);
	// in order to get enter key
	temp_email[email_len] = '\n';
	temp_email[email_len + 1] = '\0';
	email = (char*)malloc(strlen(temp_email));
	strcpy(email, temp_email);
	insert_node(&bst, name, company, address, zip, phone, email);
}

void trace(BST bst, char* target) {
	if (bst.root == NULL) {
		printf("Not found\n");
		return;
	}

	if (strcmp(bst.root->data->name, target) == 0) {
		printf("%s\n", bst.root->data->name);
		return;
	}

	BST left_subtree;
	BST right_subtree;
	init_bst(&left_subtree);
	init_bst(&right_subtree);
	left_subtree.root = bst.root->left_node;
	right_subtree.root = bst.root->right_node;

	if (strcmp(target, bst.root->data->name) < 0) {
		printf("%s\n", bst.root->data->name);
		return trace(left_subtree, target);
	}
	else {
		printf("%s\n", bst.root->data->name);
		return trace(right_subtree, target);
	}
}

void delete(BST bst, char* target_name) {
	delete_node(&bst, target_name);
}

3. Queue

Queue는 save함수를 위하여 구현되었다.
BST에서 수정된 연락처를 다시 외부 파일로 내보내야하는데, BFS를 이용하여 데이터 하나씩 파일에 써주는 것이 효율적이라 판단하였다.

dequeue 함수에서 데이터를 pop하는 대신, 원본의 형식에 맞춰 스트링에 저장하고, 그 스트링을 리턴해주었다.

add와 마찬가지로 strlen을 이용하는 부분에서 애를 먹었다.

이유는 모르겠지만 (char*)malloc(sizeof(strlen(temp) + 1)와 같이 stlen 함수와 sizeof 함수를 같이 사용하면 코드가 몇 번 진행될 시 오류가 난다. (무조건적이 아닌)

이 때 sizeof를 사용할 필요가 없다.

이것이 다른 함수에 있는 메모리 할당에서 조차 오류를 발생시켰다...

typedef struct ListNode {
	Node* bt_node;
	struct ListNode* next;
}ListNode;

typedef struct Queue {
	struct ListNode* front;
}Queue;

ListNode* new_node(Node* bt_node) {
	ListNode* pnode = (ListNode*)malloc(sizeof(ListNode));

	pnode->bt_node = bt_node;
	pnode->next = NULL;

	return pnode;
}

void init_queue(Queue* queue) {
	queue->front = NULL;
}

void enqueue(Queue* queue, Node* bt_node) {
	ListNode* node = new_node(bt_node);

	if (queue->front == NULL) {
		// if the queue is empty
		queue->front = node;
		return;
	}

	ListNode* current = queue->front;
	while (current->next != NULL) {
		current = current->next;
	}

	current->next = node;
}

char* dequeue(Queue* queue) {
	char temp[500];

	if (queue->front == NULL) {
		printf("Queue is empty.");
		return;
	}

	if (queue->front->next == NULL) {
		sprintf(temp,"%s\t%s\t%s\t%s\t%s\t%s", queue->front->bt_node->data->name, queue->front->bt_node->data->company, queue->front->bt_node->data->address, queue->front->bt_node->data->zip, queue->front->bt_node->data->phone, queue->front->bt_node->data->email);
		free(queue->front);
		queue->front = NULL;
	}

	else {
		struct ListNode* pnext = queue->front->next;
		sprintf(temp, "%s\t%s\t%s\t%s\t%s\t%s", queue->front->bt_node->data->name, queue->front->bt_node->data->company, queue->front->bt_node->data->address, queue->front->bt_node->data->zip, queue->front->bt_node->data->phone, queue->front->bt_node->data->email);
		free(queue->front);

		queue->front = pnext;
	}
	char* s;
	s = (char*)malloc(strlen(temp) + 1);
	strcpy(s, temp);

	return s;
}

4. read and write

read는 연락처를 BST로 읽어오는 함수이다.
write는 save 함수를 말한다.

strtok 함수는 버퍼에 있는 스트링을 토큰화한다.

이 때 데이터의 훼손이 일어나는데, 올바르게 토큰화된 필드를 저장하려면 구조체 배열의 사용을 추천한다.

원래 _strdup으로 값이 복사된 포인터를 각 필드에 해당하는 스트링 포인터에 옮긴 후 insert_node에 매개변수로 넣었다.

그러니 BST 트리가 하나의 값으로만 채워졌다.

save 함수는 Queue를 이용한 BFS를 활용하였다.

BST address_tree;
Data* my_data;
int size;

int get_column_size(char* file_name) {
	char str_tmp[1024];
	FILE* pFile = NULL;
	int size = 0;

	pFile = fopen(file_name, "r");

	if (pFile != NULL)
	{
		while (!feof(pFile)) {
			fgets(str_tmp, 1024, pFile);
			if (strchr(str_tmp, '\n')) {
				size++;
			}
		}
	}
	fclose(pFile);

	return size;
}

void read(char* file_name) {
	size = get_column_size("address_book2020.tsv");
	my_data = (Data*)malloc(sizeof(Data) * size);

	char str_tmp[1024];
	FILE* pFile = NULL;

	pFile = fopen(file_name, "r");
	if (pFile != NULL)
	{
		int idx = 0;

		fgets(str_tmp, 1024, pFile);
		while (idx < size) {
			fgets(str_tmp, 1024, pFile);

			my_data[idx].name = _strdup(strtok(str_tmp, "\t"));
			my_data[idx].company = _strdup(strtok(NULL, "\t"));
			my_data[idx].address = _strdup(strtok(NULL, "\t"));
			my_data[idx].zip = _strdup(strtok(NULL, "\t"));
			my_data[idx].phone = _strdup(strtok(NULL, "\t"));
			my_data[idx].email = _strdup(strtok(NULL, "\t"));
		
			idx++;
		}
	}
	fclose(pFile);

	for (int i = 0; i < size; i++) {
		insert_node(&address_tree, my_data[i].name, my_data[i].company, my_data[i].address, my_data[i].zip, my_data[i].phone, my_data[i].email);
	}
}

void save(BST bst, char* file_name) {
	FILE* pFile = NULL;
	pFile = fopen(file_name, "w+");


	Queue q;
	init_queue(&q);

	enqueue(&q, bst.root);

	ListNode* current;
	current = q.front;

	char* s;

	while (current != NULL) {
		Node* left_node = current->bt_node->left_node;
		Node* right_node = current->bt_node->right_node;

		if (left_node != NULL) {
			enqueue(&q, left_node);
		}

		if (right_node != NULL) {
			enqueue(&q, right_node);
		}
		current = current->next;
		s = dequeue(&q);
		fputs(s, pFile);
	}
	
	fclose(pFile);

	return 0;
}

5. main 함수

int main() {
	init_bst(&address_tree);
	read("address_book2020.tsv");

	//list(address_tree);

	//find(address_tree, "Annelle");

	add(address_tree, "zzz");
	
	/*find(address_tree, "zzz");

	trace(address_tree, "zzz");

	list(address_tree);

	delete(address_tree, "lenna");

	list(address_tree);*/

	save(address_tree,"address_book2.tsv");
}

6. Take Home Message

사람들이 왜 C가 불편하다고 하는지 다시 알게되었다. 그리고 나름의 해결 과정을 적어보았다.

사소한 오류들이 생긴다.

다른 접근 방식을 생각해 보고 예제 파일을 만들어 간단하게 구현해본다.

예제 파일에서도 오류가 발생한다면 구글링 또는 Chat GPT를 사용한다.

디버깅을 꼼꼼하게 하며 수정한다.

함수들이 불친절하다.
- 구글링이나 다큐먼트를 통하여 아주 기초적인 형태로 구현한다.
- 필요한 부분들을 추가한다.

불편함은 C의 특징을 생각하면 당연한 것들이다.
모든 언어에 적용되는 기본 원리이니, 미래에 도움이 되지 않을까?

오동환

게임 개발 공부하고 있어요

이전 포스트

엑셀 (csv) 파일 받아 정렬하기

다음 포스트

외부 파일을 BST에 저장하기

Algorithm

1. BST

2. 함수 구현

3. Queue

4. read and write

5. main 함수

6. Take Home Message

엑셀 (csv) 파일 받아 정렬하기

Red-Black Tree 예쁘다.

0개의 댓글