Python File System 벤치마크

td.junopark·2025년 8월 28일

끄적끄적

목록 보기

3/3

1. 테스트 목적

Python으로 여러 스크립트를 작성하다 보면, 특정 디렉토리를 재귀적으로 탐색하며 파일을 필터링해야 하는 경우가 상당히 많다.

과거의 코드를 보면 대부분 os.listdir 메서드와 re 라이브러리를 활용해 작성된 경우가 많은데, 이러한 경우 탐색 시간이 지나치게 오래 걸리는 문제가 있었다.

때문에 os.scandir, os.walk 등 다양한 메서드를 벤치마크하는 간단한 스크립트를 작성해보았다.

2. 스크립트 작성

import os
import re
import glob
import time
import shutil
import random
import string
from pathlib import Path

class FolderScanBenchmark:
    """Benchmark several ways of recursively collecting files by extension.

    The benchmark builds a throw-away directory tree (one chain of nested
    directories with files scattered randomly across every level), then times
    six different traversal strategies that all look for files whose names end
    with one of ``target_extensions``.

    Attributes set by ``__init__``:
        base_path: Root of the generated test tree (deleted by ``cleanup``).
        max_depth: Number of nested directories created below ``base_path``.
        total_files: Number of files written into the tree.
        file_extensions: Extensions randomly assigned to generated files.
        target_extensions: Extensions every scan method filters for.
    """

    def __init__(self, base_path="test_directory", max_depth=100, total_files=100000):
        self.base_path = base_path
        self.max_depth = max_depth
        self.total_files = total_files
        self.file_extensions = ['.txt', '.py', '.js', '.json', '.csv', '.log', '.md', '.xml']
        self.target_extensions = ['.txt', '.py', '.json']

    def create_test_structure(self):
        """Create the test tree from scratch and fill it with small files.

        Any existing directory at ``base_path`` is removed first. A single
        chain of ``max_depth`` nested directories is created, after which
        ``total_files`` files are written into randomly chosen levels of that
        chain (the root included).

        Returns:
            A ``(files_created, max_depth)`` tuple.
        """
        if os.path.exists(self.base_path):
            shutil.rmtree(self.base_path)

        print(f"테스트 디렉토리 구조 생성 중... (깊이: {self.max_depth}, 파일: {self.total_files}개)")

        # Build one directory per depth level, each nested inside the previous.
        dir_paths = [self.base_path]
        current_path = self.base_path
        for d in range(self.max_depth):
            dir_name = f"dir_depth_{d}_{self.generate_random_name(5)}"
            current_path = os.path.join(current_path, dir_name)
            os.makedirs(current_path, exist_ok=True)
            dir_paths.append(current_path)

        files_created = 0
        while files_created < self.total_files:
            target_dir = random.choice(dir_paths)
            file_ext = random.choice(self.file_extensions)
            file_name = f"file_{files_created}_{self.generate_random_name(8)}{file_ext}"
            file_path = os.path.join(target_dir, file_name)
            # Explicit encoding keeps the written bytes independent of the
            # platform's default locale.
            with open(file_path, 'w', encoding='utf-8') as f:
                f.write(f"Test file content {files_created}")
            files_created += 1

        print(f"디렉토리 구조 생성 완료: {files_created}개 파일, 최대 깊이 {self.max_depth}")
        return files_created, self.max_depth

    def generate_random_name(self, length):
        """Return a random string of ``length`` lowercase ASCII letters."""
        return ''.join(random.choices(string.ascii_lowercase, k=length))

    def _compile_extension_pattern(self):
        """Compile a case-insensitive regex matching names that end with a target extension.

        The pattern is derived from ``target_extensions`` so the regex-based
        methods filter for the same extensions as every other method. With no
        target extensions a never-matching pattern is returned.
        """
        if not self.target_extensions:
            return re.compile(r'(?!)')
        alternatives = '|'.join(re.escape(ext) for ext in self.target_extensions)
        return re.compile(rf'(?:{alternatives})$', re.IGNORECASE)

    def method1_scandir_re(self):
        """Method 1: recursive ``os.scandir`` filtered with a compiled regex.

        Returns:
            List of paths of matching files.
        """
        pattern = self._compile_extension_pattern()
        found_files = []

        def scan_recursive(path):
            try:
                with os.scandir(path) as entries:
                    for entry in entries:
                        if entry.is_file():
                            if pattern.search(entry.name):
                                found_files.append(entry.path)
                        elif entry.is_dir():
                            scan_recursive(entry.path)
            except OSError:
                # Best effort: unreadable directories (PermissionError is an
                # OSError subclass) are skipped rather than aborting the scan.
                pass

        scan_recursive(self.base_path)
        return found_files

    def method2_glob_recursive(self):
        """Method 2: ``glob.glob`` with a recursive ``**`` pattern per extension.

        Returns:
            List of paths matched by any of the per-extension patterns.
        """
        found_files = []
        for ext in self.target_extensions:
            pattern = os.path.join(self.base_path, "**", f"*{ext}")
            found_files.extend(glob.glob(pattern, recursive=True))
        return found_files

    def method3_pathlib_glob(self):
        """Method 3: ``pathlib.Path.rglob`` once per extension.

        Returns:
            List of matched paths converted to strings.
        """
        found_files = []
        base = Path(self.base_path)
        for ext in self.target_extensions:
            found_files.extend(str(p) for p in base.rglob(f"*{ext}"))
        return found_files

    def method4_walk_filter(self):
        """Method 4: ``os.walk`` with a case-insensitive suffix check.

        Returns:
            List of paths of matching files.
        """
        found_files = []
        # str.endswith accepts a tuple, so one call tests every extension.
        suffixes = tuple(ext.lower() for ext in self.target_extensions)

        for root, _dirs, files in os.walk(self.base_path):
            found_files.extend(
                os.path.join(root, name)
                for name in files
                if name.lower().endswith(suffixes)
            )

        return found_files

    def method5_scandir_optimized(self):
        """Method 5: recursive ``os.scandir`` with a set lookup on the extension.

        Only the last suffix of each name (as returned by
        ``os.path.splitext``) is compared against the target set.

        Returns:
            List of paths of matching files.
        """
        target_exts = {ext.lower() for ext in self.target_extensions}
        found_files = []

        def scan_recursive(path):
            try:
                with os.scandir(path) as entries:
                    for entry in entries:
                        if entry.is_file():
                            file_ext = os.path.splitext(entry.name)[1].lower()
                            if file_ext in target_exts:
                                found_files.append(entry.path)
                        elif entry.is_dir():
                            scan_recursive(entry.path)
            except OSError:
                # Best effort: skip directories that cannot be read.
                pass

        scan_recursive(self.base_path)
        return found_files

    def method6_listdir_re(self):
        """Method 6: recursive ``os.listdir`` filtered with a compiled regex.

        Each entry is classified with ``os.path.isfile`` / ``os.path.isdir``.

        Returns:
            List of paths of matching files.
        """
        pattern = self._compile_extension_pattern()
        found_files = []

        def scan_recursive(path):
            try:
                for entry in os.listdir(path):
                    full_path = os.path.join(path, entry)
                    if os.path.isfile(full_path):
                        if pattern.search(entry):
                            found_files.append(full_path)
                    elif os.path.isdir(full_path):
                        scan_recursive(full_path)
            except OSError:
                # Best effort: skip directories that cannot be read.
                pass

        scan_recursive(self.base_path)
        return found_files

    def run_benchmark(self, iterations=3):
        """Time every scan method ``iterations`` times.

        Args:
            iterations: Number of timed runs per method; must be at least 1.

        Returns:
            Dict mapping method label to a dict with ``avg_time``,
            ``min_time``, ``max_time``, ``files_found`` (count from the first
            run) and ``times`` (list of every run's elapsed seconds).

        Raises:
            ValueError: If ``iterations`` is less than 1, since no statistics
                can be computed from zero runs.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")

        methods = {
            "os.scandir + re": self.method1_scandir_re,
            "glob.glob recursive": self.method2_glob_recursive,
            "pathlib.rglob": self.method3_pathlib_glob,
            "os.walk + filter": self.method4_walk_filter,
            "os.scandir optimized": self.method5_scandir_optimized,
            "os.listdir + re": self.method6_listdir_re,
        }

        results = {}

        for method_name, method_func in methods.items():
            print(f"\n{method_name} 테스트 중...")
            times = []
            files_found = None

            for i in range(iterations):
                start_time = time.perf_counter()
                found_files = method_func()
                end_time = time.perf_counter()

                elapsed = end_time - start_time
                times.append(elapsed)

                # The match count is recorded once, from the first run.
                if files_found is None:
                    files_found = len(found_files)

                print(f"  반복 {i+1}: {elapsed:.4f}초, {len(found_files)}개 파일")

            results[method_name] = {
                'avg_time': sum(times) / len(times),
                'min_time': min(times),
                'max_time': max(times),
                'files_found': files_found,
                'times': times,
            }

        return results

    def print_results(self, results):
        """Print a summary table and relative slowdowns, fastest method first.

        Args:
            results: Mapping as returned by ``run_benchmark``. An empty
                mapping prints only the headers.
        """
        print("\n" + "="*80)
        print("벤치마크 결과 요약")
        print("="*80)

        sorted_results = sorted(results.items(), key=lambda x: x[1]['avg_time'])

        print(f"{'방법':<25} {'평균시간':<10} {'최소시간':<10} {'최대시간':<10} {'찾은파일':<8}")
        print("-" * 80)

        for method_name, data in sorted_results:
            print(f"{method_name:<25} {data['avg_time']:.4f}s   {data['min_time']:.4f}s   "
                  f"{data['max_time']:.4f}s   {data['files_found']:>6}")

        print("\n상대적 성능 비교:")
        print("-" * 50)

        if not sorted_results:
            return

        fastest_time = sorted_results[0][1]['avg_time']

        for method_name, data in sorted_results:
            if fastest_time > 0:
                ratio = data['avg_time'] / fastest_time
            elif data['avg_time'] == 0:
                # Both measured as zero: treat as equally fast.
                ratio = 1.0
            else:
                # Fastest time was below timer resolution; avoid dividing by zero.
                ratio = float('inf')
            print(f"{method_name:<25}: {ratio:.2f}x")

    def cleanup(self):
        """Delete the test tree at ``base_path`` if it exists."""
        if os.path.exists(self.base_path):
            shutil.rmtree(self.base_path)
            print(f"\n테스트 디렉토리 '{self.base_path}' 삭제 완료")

def main():
    """Build the test tree, run and report the benchmark, then always clean up."""
    bench = FolderScanBenchmark()

    try:
        created_count, depth = bench.create_test_structure()

        print(f"\n벤치마크 시작 - 타겟 확장자: {bench.target_extensions}")
        summary = bench.run_benchmark(iterations=5)

        bench.print_results(summary)

        # Trailing report describing the parameters of this run.
        report_lines = (
            "\n추가 정보:",
            f"- 총 파일 수: {created_count}",
            f"- 최대 깊이: {depth}",
            f"- 타겟 확장자: {', '.join(bench.target_extensions)}",
        )
        for line in report_lines:
            print(line)

    finally:
        # The generated tree is removed even when the benchmark fails midway.
        bench.cleanup()


if __name__ == "__main__":
    main()

이 스크립트는 os.scandir + re, glob.glob, pathlib.rglob, os.walk + filter, os.scandir (set lookup), os.listdir + re 6가지 경우를 벤치마킹하도록 작성되었다.
각 메서드로 5회 반복 테스트하여 평균치로 속도를 비교했다.
추가로, 디렉토리 깊이는 100, 총 파일 수는 100,000으로 설정 후 비교했다.

3. 테스트 환경

OS: Windows 10
CPU: AMD Ryzen 9 9900X 12-Core
RAM: 128GB

4. 테스트 결과

테스트 디렉토리 구조 생성 중... (깊이: 100, 파일: 100000개)
디렉토리 구조 생성 완료: 100000개 파일, 최대 깊이 100

벤치마크 시작 - 타겟 확장자: ['.txt', '.py', '.json']

os.scandir + re 테스트 중...
  반복 1: 0.3101초, 37282개 파일
  반복 2: 0.2960초, 37282개 파일
  반복 3: 0.2844초, 37282개 파일
  반복 4: 0.2887초, 37282개 파일
  반복 5: 0.2825초, 37282개 파일

glob.glob recursive 테스트 중...
  반복 1: 1.5839초, 37282개 파일
  반복 2: 1.6048초, 37282개 파일
  반복 3: 1.5780초, 37282개 파일
  반복 4: 1.5927초, 37282개 파일
  반복 5: 1.5675초, 37282개 파일

pathlib.rglob 테스트 중...
  반복 1: 1.9131초, 37282개 파일
  반복 2: 1.8693초, 37282개 파일
  반복 3: 1.8965초, 37282개 파일
  반복 4: 1.9497초, 37282개 파일
  반복 5: 1.9430초, 37282개 파일

os.walk + filter 테스트 중...
  반복 1: 0.3879초, 37282개 파일
  반복 2: 0.3843초, 37282개 파일
  반복 3: 0.3589초, 37282개 파일
  반복 4: 0.3991초, 37282개 파일
  반복 5: 0.3971초, 37282개 파일

os.scandir optimized 테스트 중...
  반복 1: 0.3413초, 37282개 파일
  반복 2: 0.3708초, 37282개 파일
  반복 3: 0.3340초, 37282개 파일
  반복 4: 0.3543초, 37282개 파일
  반복 5: 0.3554초, 37282개 파일

os.listdir + re 테스트 중...
  반복 1: 2.1937초, 37282개 파일
  반복 2: 2.1917초, 37282개 파일
  반복 3: 2.3004초, 37282개 파일
  반복 4: 2.2001초, 37282개 파일
  반복 5: 2.2650초, 37282개 파일

================================================================================     
벤치마크 결과 요약
================================================================================     
방법                        평균시간       최소시간       최대시간       찾은파일    
--------------------------------------------------------------------------------     
os.scandir + re           0.2923s   0.2825s   0.3101s    37282
os.scandir optimized      0.3511s   0.3340s   0.3708s    37282
os.walk + filter          0.3855s   0.3589s   0.3991s    37282
glob.glob recursive       1.5854s   1.5675s   1.6048s    37282
pathlib.rglob             1.9143s   1.8693s   1.9497s    37282
os.listdir + re           2.2302s   2.1917s   2.3004s    37282

상대적 성능 비교:
--------------------------------------------------
os.scandir + re          : 1.00x
os.scandir optimized     : 1.20x
os.walk + filter         : 1.32x
glob.glob recursive      : 5.42x
pathlib.rglob            : 6.55x
os.listdir + re          : 7.63x

추가 정보:
- 총 파일 수: 100000
- 최대 깊이: 100
- 타겟 확장자: .txt, .py, .json

테스트 디렉토리 'test_directory' 삭제 완료

테스트 결과, os.scandir + re 방식이 평균 0.2923초로 가장 빨랐고, os.scandir (set lookup) 방식이 0.3511초로 그 뒤를 이었다.

확실히 os.listdir의 경우 오버헤드가 큰 오래된 메서드다 보니 속도가 확연히 느린 것을 알 수 있다.
추가로, 조건을 변경하여 테스트 해본 결과 최대 깊이와 파일 수가 늘어날수록 os.scandir + re가 우세했다.

5. 결론

양이 많지 않은 디렉토리를 탐색하는 경우엔 os.listdir 메서드를 사용하면 되겠으나,
테스트 결과 디렉토리 깊이가 1이고 파일 수가 100,000인 경우에도 여전히 os.listdir가 가장 오래 걸리는 것을 확인했다.

Python을 다루는 다양한 커뮤니티에서도 os.listdir은 과거의 유산이라고 할 정도이니 되도록 사용하지 않는 것이 좋을 것 같다.

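추가로, os.scandir의 경우 디렉토리를 읽을 때 파일 종류 정보를 함께 가져오기 때문에 is_file()이나 is_dir()을 호출해도 대부분 추가적인 시스템 콜이 발생하지 않는다(심볼릭 링크 등 일부 경우는 예외). 반면 os.listdir는 이름만 반환하므로 os.path.isfile()이나 os.path.isdir()을 호출할 때마다 별도의 stat 호출이 필요하다. 이 때문에 일반적인 환경에서 os.scandir의 성능이 훨씬 뛰어나다고 볼 수 있겠다.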

td.junopark

공부 중🙄

이전 포스트