[SpringBoot] 대용량 데이터 CSV 파일을 DB에 Insert하는 방법

Ming·2022년 11월 2일
2

GitHub hospital-api
100,000건이 넘는 병원정보 csv 파일을 가져와 DB에 저장하는 프로젝트이다.

과정
1. csv 파일 불러와서 parsing
2. parsing한 데이터 DB insert

CSV 파일 불러와서 Parsing

ReadLineContext.java
파일에서 한줄씩 읽어와서 parse 메서드를 통해 파싱을 한 결과를 result 리스트에 추가한다.

package com.springboot.hospitalsearchapi.parser;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ReadLineContext<T> {
    Parser<T> parser;

    public ReadLineContext(Parser<T> parser) {
        this.parser = parser;
    }

    public List<T> readByLine(String filename) throws IOException {
        // 삽
        List<T> result = new ArrayList<>();
        BufferedReader reader = new BufferedReader(
                new FileReader(filename)
        );
        String str;
        while ((str = reader.readLine()) != null) {
            try {
                result.add(parser.parse(str));
            }catch(Exception e){
                System.out.printf("파싱 중 문제가 생겨 이 라인은 넘어갑니다. 파일내용 : %s\n", str.substring(0, 20));
            }
        }
        reader.close();
        return result;
    }

}

Parser.java
Parser 인터페이스이다.

package com.springboot.hospitalsearchapi.parser;

public interface Parser<T> {
    T parse(String str);
}

ParserFactory.java

package com.springboot.hospitalsearchapi.parser;

import com.springboot.hospitalsearchapi.domain.Hospital;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

@Configuration
public class ParserFactory {
    @Bean
    public ReadLineContext<Hospital> hospitalReadLineContextest(){
        return new ReadLineContext<Hospital>(new HospitalParser());
    }
}

HospitalParser.java
병원정보 한줄을 파싱하고 Hospital 객체를 생성해서 반환한다.

package com.springboot.hospitalsearchapi.parser;

import com.springboot.hospitalsearchapi.domain.Hospital;

import java.time.LocalDate;
import java.time.LocalDateTime;

public class HospitalParser implements Parser<Hospital>{

    @Override
    public Hospital parse(String str) {
        String[] splitted = str.split("\",\"");
        Hospital hospital = new Hospital();
        hospital.setId(Integer.parseInt(splitted[0].replace("\"","")));
        hospital.setOpenServiceName(splitted[1]);
        hospital.setOpenLocalGovernmentCode(Integer.parseInt(splitted[3]));
        hospital.setManagementNumber(splitted[4]);

        int year=Integer.parseInt(splitted[5].substring(0, 4));
        int month=Integer.parseInt(splitted[5].substring(4, 6));
        int day=Integer.parseInt(splitted[5].substring(6, 8));
        hospital.setLicenseDate(LocalDateTime.of(year, month, day, 0, 0, 0));

        hospital.setBusinessStatus(Integer.parseInt(splitted[7]));
        hospital.setBusinessStatusCode(Integer.parseInt(splitted[9]));
        hospital.setPhone(splitted[15]);
        hospital.setFullAddress(splitted[18]);
        hospital.setRoadNameAddress(splitted[19]);
        hospital.setHospitalName(splitted[21]);
        hospital.setBusinessTypeName(splitted[25]);
        hospital.setHealthcareProviderCount(Integer.parseInt(splitted[29]));
        hospital.setPatientRoomCount(Integer.parseInt(splitted[30]));
        hospital.setTotalNumberOfBeds(Integer.parseInt(splitted[31]));
        hospital.setTotalAreaSize(Float.parseFloat(splitted[32]));
        return hospital;
    }
}

Parsing한 데이터 DB insert

데이터가 십만건이상이라 많은 시간이 걸린다.
HospitalDao.java
JdbcTemplate를 이용해 DB에 Insert하고 Delete해주는 메서드를 구현했다.

package com.springboot.hospitalsearchapi.dao;

import com.springboot.hospitalsearchapi.domain.Hospital;
import lombok.extern.slf4j.Slf4j;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.context.annotation.Configuration;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.RowMapper;
import org.springframework.stereotype.Component;

import javax.sql.DataSource;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;

@Component
@Slf4j
public class HospitalDao {
    private final DataSource dataSource;
    private final JdbcTemplate jdbcTemplate;

    public HospitalDao(DataSource dataSource, JdbcTemplate jdbcTemplate){
        this.dataSource=dataSource;
        this.jdbcTemplate=jdbcTemplate;
    }

    public void add(final Hospital hospital){
        String sql = "Insert into nation_wide_hospitals values " +
                "(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)";
        this.jdbcTemplate.update(sql, hospital.getId(), hospital.getOpenServiceName(),
                hospital.getOpenLocalGovernmentCode(), hospital.getManagementNumber(),
                hospital.getLicenseDate(), hospital.getBusinessStatus(),
                hospital.getBusinessStatusCode(), hospital.getPhone(),
                hospital.getFullAddress(), hospital.getRoadNameAddress(),
                hospital.getHospitalName(), hospital.getBusinessTypeName(),
                hospital.getHealthcareProviderCount(), hospital.getPatientRoomCount(),
                hospital.getTotalNumberOfBeds(), hospital.getTotalAreaSize());
    }
    
    public int deleteAll(){
        return this.jdbcTemplate.update("delete from nation_wide_hospitals");
    }
    
    public int getCount(){
        return this.jdbcTemplate.queryForObject(
                "select count(*) from nation_wide_hospitals", Integer.class);
    }
}

HospitalParserTest.java
데이터가 잘 들어가는지 Test하는 코드입니다.

package com.springboot.hospitalsearchapi.parser;

import com.springboot.hospitalsearchapi.dao.HospitalDao;
import com.springboot.hospitalsearchapi.domain.Hospital;
import com.springboot.hospitalsearchapi.service.HospitalService;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.IOException;
import java.time.LocalDateTime;
import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

@SpringBootTest
class HospitalParserTest {

    @Autowired
    ReadLineContext<Hospital> hospitalReadLineContext;

    @Autowired
    HospitalDao hospitalDao;

    @Autowired
    HospitalService hospitalService;
    
    @Test
    @DisplayName("병원정보 전체 add 잘되는지")
    void addAllTest() throws IOException {
        String filename="/Users/minji/Documents/likelion/file/fulldata_01_01_02_P_의원.csv";
        hospitalDao.deleteAll();
        int cnt=hospitalService.insertLargeVolumHospitalData(filename);
        assertEquals(cnt, hospitalDao.getCount());
    }
    
    @Test
    @DisplayName("Hospital deleteAll, getCount 잘 되는지")
    void deleteAllTest(){
        hospitalDao.deleteAll();
        assertEquals(0, hospitalDao.getCount());
    }

}

병렬처리

데이터가 다 들어가는데 한시간 걸리던 작업이 병렬처리를 하고 나서는 10분 이내로 끝난다.
HospitalService.java 추가

package com.springboot.hospitalsearchapi.service;

import com.springboot.hospitalsearchapi.dao.HospitalDao;
import com.springboot.hospitalsearchapi.domain.Hospital;
import com.springboot.hospitalsearchapi.parser.ReadLineContext;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;

import java.io.IOException;
import java.util.List;
import java.util.Optional;

@Service
public class HospitalService {

    private final ReadLineContext<Hospital> hospitalReadLineContext;

    private final HospitalDao hospitalDao;

    public HospitalService(ReadLineContext<Hospital> hospitalReadLineContext, HospitalDao hospitalDao) {
        this.hospitalReadLineContext = hospitalReadLineContext;
        this.hospitalDao = hospitalDao;
    }

    @Transactional
    public int insertLargeVolumeHospitalData(String filename) {
        List<Hospital> hospitalList;
        try {
            hospitalList = hospitalReadLineContext.readByLine(filename);
            System.out.println("파싱이 끝났습니다.");
            hospitalList.stream()
                    .parallel()
                    .forEach(hospital -> {
                        try {
                            this.hospitalDao.add(hospital); // db에 insert하는 구간
                        } catch (Exception e) {
                            System.out.printf("id:%d 레코드에 문제가 있습니다.\n",hospital.getId());
                            throw new RuntimeException(e);
                        }
                    });
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        if (!Optional.of(hospitalList).isEmpty()) {
            return hospitalList.size();
        } else {
            return 0;
        }
    }
}

HospitalParserTest.java 수정

package com.springboot.hospitalsearchapi.parser;

import com.springboot.hospitalsearchapi.dao.HospitalDao;
import com.springboot.hospitalsearchapi.domain.Hospital;
import com.springboot.hospitalsearchapi.service.HospitalService;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.DisplayName;
import org.junit.jupiter.api.Test;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.context.SpringBootTest;

import java.io.IOException;
import java.time.LocalDateTime;
import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

@SpringBootTest
class HospitalParserTest {

    @Autowired
    ReadLineContext<Hospital> hospitalReadLineContext;

    @Autowired
    HospitalDao hospitalDao;

    @Autowired
    HospitalService hospitalService;

    @Test
    @DisplayName("병원정보 전체 add 잘되는지")
    void addAllTest() throws IOException {
        String filename="/Users/minji/Documents/likelion/file/fulldata_01_01_02_P_의원.csv";
        hospitalDao.deleteAll();
        int cnt=hospitalService.insertLargeVolumHospitalData(filename);
        assertEquals(cnt, hospitalDao.getCount());
    }
    
    @Test
    @DisplayName("Hospital deleteAll, getCount 잘 되는지")
    void deleteAllTest(){
        hospitalDao.deleteAll();
        assertEquals(0, hospitalDao.getCount());
    }

}

0개의 댓글