"""Rename and sort Korean nationwide mock-exam (전국연합학력평가) files.

The script works in three phases on a single download directory:

1. Rename every PDF (and listening MP3) to a canonical name derived from the
   header text printed on the first page of the PDF.
2. Move the renamed files into per-area sub-folders.
3. Report question papers and answer sheets that have no counterpart.
"""
import os
import re
from collections import Counter
from typing import Dict, List, Optional, Set, Tuple

try:
    import fitz  # PyMuPDF
except ImportError:
    # Keep the text/filename helpers importable without PyMuPDF; analyze_pdf()
    # raises a clear ImportError as soon as a PDF actually has to be read.
    fitz = None

# Directory that contains the exam files to process.
TARGET_DIR = r'C:\Users\User\Downloads\시험지 다운'

# Elective subjects that can be recognised, grouped by exam area.
KOREAN_SUBJECTS = ['언어와 매체', '화법과 작문']
SCIENCE_SUBJECTS = ['물리학Ⅰ', '화학Ⅰ', '생명과학Ⅰ', '지구과학Ⅰ', '물리학Ⅱ', '화학Ⅱ', '생명과학Ⅱ', '지구과학Ⅱ', '통합과학']
SOCIAL_SUBJECTS = ['생활과 윤리', '윤리와 사상', '한국지리', '세계지리', '동아시아사', '세계사', '정치와 법', '경제', '사회·문화', '통합사회']
FOREIGN_LANG_SUBJECTS = ['독일어Ⅰ', '프랑스어Ⅰ', '스페인어Ⅰ', '중국어Ⅰ', '일본어Ⅰ', '러시아어Ⅰ', '아랍어Ⅰ', '베트남어Ⅰ', '한문Ⅰ']

# Height of the band at the top of page 1 that is scanned for the exam
# header: 120 mm converted to PDF points (72 pt per 25.4 mm).
_HEADER_HEIGHT_PT = 120 * (72 / 25.4)

_YEAR_MONTH_RE = re.compile(r'(\d{4}학년도\s+\d{1,2}월)')
_GRADE_RE = re.compile(r'(고[1-3])')
_EXAM_NAME_RE = re.compile(r'(전국연합학력평가)')
_ELECTIVE_RE = re.compile(r'\((.+)\)')
_PARSED_NAME_RE = re.compile(r'전국연합학력평가\s(.+)_(정답 및 해설|문제지|대본|듣기)\.(pdf|mp3)$')
_PAIR_NAME_RE = re.compile(r'전국연합학력평가\s(.+)_(문제지|정답 및 해설)\.pdf$')
_ALREADY_RENAMED_RE = re.compile(r'_(?:문제지|정답 및 해설|대본)\.pdf$')
_INVALID_FILENAME_CHARS_RE = re.compile(r'[\\*?:"<>|]')

# Areas without electives: normalized header keyword -> display name.
_SIMPLE_SUBJECTS = {'수학영역': '수학 영역', '영어영역': '영어 영역', '한국사영역': '한국사 영역'}

# (electives, area display name, normalized keyword that must also appear in
# the text).  The keyword is spelled out per area on purpose: deriving it with
# substring tests is a trap because "제2외국어" itself contains "국어".
_SUBJECT_GROUPS = (
    (KOREAN_SUBJECTS, "국어 영역", "국어영역"),
    (SCIENCE_SUBJECTS, "과학탐구 영역", "탐구영역"),
    (SOCIAL_SUBJECTS, "사회탐구 영역", "탐구영역"),
    (FOREIGN_LANG_SUBJECTS, "제2외국어/한문 영역", "제2외국어/한문영역"),
)

# Internal file type -> suffix used in the generated filename.
_TYPE_SUFFIX = {'대본': '대본', '해설': '정답 및 해설', '문제지': '문제지'}


# --- Helper Functions ---
def _safe_rename(src: str, dst: str) -> None:
    """Rename *src* to *dst* without ever replacing a different existing file.

    ``os.rename`` silently overwrites the destination on POSIX but raises on
    Windows; raising ``FileExistsError`` (an ``OSError``) here makes both
    platforms behave the same and prevents accidental data loss.
    """
    if os.path.exists(dst) and not os.path.samefile(src, dst):
        raise FileExistsError(f"Destination already exists: {dst}")
    os.rename(src, dst)


def analyze_pdf(pdf_path: str) -> Tuple[str, str]:
    """Return ``(header_text, full_text)`` extracted from the first page.

    ``header_text`` is the text inside the top 120 mm of the page and
    ``full_text`` is the text of the whole first page.  Both are empty strings
    when the document has no pages or cannot be read.

    Raises:
        ImportError: if PyMuPDF is not installed.
    """
    if fitz is None:
        raise ImportError("PyMuPDF (fitz) is required to read PDF files.")
    try:
        with fitz.open(pdf_path) as doc:
            if not doc or len(doc) == 0:
                return "", ""
            page = doc.load_page(0)
            full_text = page.get_text("text", sort=True) or ""
            clip_rect = fitz.Rect(0, 0, page.rect.width, min(_HEADER_HEIGHT_PT, page.rect.height))
            header_text = page.get_text("text", clip=clip_rect, sort=True) or ""
            return header_text, full_text
    except Exception as e:
        # Deliberately broad: one unreadable PDF must not abort the whole run.
        print(f" -> ERROR: Reading {os.path.basename(pdf_path)} failed. {e}")
        return "", ""


def normalize_text(text: str) -> str:
    """Strip all whitespace and every middle-dot variant so texts compare equal."""
    return re.sub(r'[\s·ㆍ∙・]', '', text)


def _detect_subject(text_to_parse: str, file_type: str) -> Optional[str]:
    """Return the subject display name found in *text_to_parse*, or ``None``."""
    normalized = normalize_text(text_to_parse)

    for keyword, display_name in _SIMPLE_SUBJECTS.items():
        if keyword in normalized:
            return display_name

    # Every elective whose name occurs in the text, in declaration order.
    candidates = [
        (elective, area, area_keyword)
        for electives, area, area_keyword in _SUBJECT_GROUPS
        for elective in electives
        if normalize_text(elective) in normalized
    ]

    # Strict pass: the elective only counts if its area header is present too.
    for elective, area, area_keyword in candidates:
        if area_keyword in normalized:
            return f"{area}({elective})"

    if '국어영역' in normalized:
        return "국어 영역"

    if file_type == '해설':
        # Answer sheets often omit the area header, so accept the bare name.
        print(" -> Context search failed for answer sheet, trying lenient search...")
        if candidates:
            elective, area, _ = candidates[0]
            return f"{area}({elective})"
    return None


def create_new_filename(text_to_parse: str, context_base_info: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
    """Build the canonical filename for a PDF from its extracted text.

    Args:
        text_to_parse: Text taken from the PDF (header band or whole page).
        context_base_info: Exam information ("<year> <month> <grade> <exam>")
            learned from another file; used when the text itself lacks it.

    Returns:
        ``(new_filename, base_info)``, or ``(None, None)`` when either the
        exam information or the subject cannot be determined.
    """
    if not text_to_parse:
        return None, None

    year_month_match = _YEAR_MONTH_RE.search(text_to_parse)
    grade_match = _GRADE_RE.search(text_to_parse)
    exam_name_match = _EXAM_NAME_RE.search(text_to_parse)

    if year_month_match and grade_match and exam_name_match:
        base_info = f"{year_month_match.group(1)} {grade_match.group(1)} {exam_name_match.group(1)}".strip()
    elif context_base_info:
        # The text carries no exam information: fall back to the injected one.
        base_info = context_base_info
    else:
        return None, None

    # Answer sheets are checked before listening scripts, so a text that
    # mentions both is classified as an answer sheet.
    subject, file_type = None, '문제지'
    if '정답' in text_to_parse and '해설' in text_to_parse:
        file_type = '해설'
    elif '대본' in text_to_parse and '영어' in text_to_parse:
        subject, file_type = '영어 영역', '대본'

    if not subject:
        subject = _detect_subject(text_to_parse, file_type)
    if not subject:
        return None, None

    subject_for_filename = subject.replace('/', '_')
    new_filename = f"{base_info} {subject_for_filename}_{_TYPE_SUFFIX[file_type]}.pdf"
    # Drop any remaining characters that are invalid in Windows filenames.
    return _INVALID_FILENAME_CHARS_RE.sub("", new_filename), base_info


def get_folder_prefix_and_grade(directory: str) -> Tuple[str, str]:
    """Return the most common "<year> <month>" prefix and grade among the files.

    Only files whose name starts with '20' are considered; the whole tree
    below *directory* is scanned.  A missing value is an empty string.
    """
    prefixes, grades = Counter(), Counter()
    for _, _, filenames in os.walk(directory):
        for name in filenames:
            if not name.startswith('20'):
                continue
            prefix_match = _YEAR_MONTH_RE.match(name)
            grade_match = _GRADE_RE.search(name)
            if prefix_match:
                prefixes[prefix_match.group(1)] += 1
            if grade_match:
                grades[grade_match.group(1)] += 1
    prefix = prefixes.most_common(1)[0][0] if prefixes else ""
    grade = grades.most_common(1)[0][0] if grades else ""
    return prefix, grade


def parse_filename(filename: str) -> Dict[str, Optional[str]]:
    """Parse a canonical filename into ``{'grade', 'subject', 'type'}``.

    Values that cannot be recognised stay ``None``.  The type is one of
    '해설', '문제지', '대본' or '듣기'.
    """
    info = {'grade': None, 'subject': None, 'type': None}
    grade_match = _GRADE_RE.search(filename)
    if grade_match:
        info['grade'] = grade_match.group(1)
    match = _PARSED_NAME_RE.search(filename)
    if match:
        info['subject'] = match.group(1).strip()
        type_str = match.group(2)
        info['type'] = '해설' if type_str == '정답 및 해설' else type_str
    return info


def _electives_in(inventory: List[dict], area: str) -> Set[str]:
    """Return the elective names (text in parentheses) of all items in *area*."""
    names = set()
    for item in inventory:
        subject = item['subject']
        if subject and area in subject:
            match = _ELECTIVE_RE.search(subject)
            if match:
                names.add(match.group(1))
    return names


def _target_folder(item: dict, has_korean_selectives: bool, create_social_folder: bool, create_science_folder: bool) -> Optional[str]:
    """Return the base folder name for *item*, or ``None`` to leave it in place."""
    if item.get('type') == '해설':
        return "정답 및 해설"
    subject = item.get('subject')
    if not subject:
        return None
    # Must be tested before '국어': "제2외국어" contains "국어" as a substring.
    if '제2외국어' in subject:
        return "제2외국어_한문"
    if '국어' in subject:
        return "국어" if has_korean_selectives else None
    if '영어' in subject:
        return "영어(문제_듣기_대본)"
    if '사회탐구' in subject:
        return "사회탐구" if create_social_folder else None
    if '과학탐구' in subject:
        return "과학탐구" if create_science_folder else None
    return None


def organize_files_final(directory: str, folder_prefix: str, grade_str: str) -> None:
    """Move renamed files into per-area sub-folders of *directory*.

    An inventory of every file starting with '20' (sub-folders included) is
    built first, then each file is moved according to these rules:

    * answer sheets go to "정답 및 해설";
    * Korean files get a folder only if elective Korean subjects exist;
    * social/science files get a folder unless the only elective found is
      the integrated subject ('통합사회' / '통합과학');
    * English and second-foreign-language files always get a folder;
    * everything else stays where it is.
    """
    print("\n--- Phase 2: Organizing all files ---")
    inventory = []
    for dirpath, _, filenames in os.walk(directory):
        for name in filenames:
            if name.startswith('20'):
                inventory.append({'path': os.path.join(dirpath, name), **parse_filename(name)})

    has_korean_selectives = any(
        item['subject'] and ('언어와 매체' in item['subject'] or '화법과 작문' in item['subject'])
        for item in inventory
    )
    social_specifics = _electives_in(inventory, '사회탐구')
    science_specifics = _electives_in(inventory, '과학탐구')
    create_social_folder = bool(social_specifics) and social_specifics != {'통합사회'}
    create_science_folder = bool(science_specifics) and science_specifics != {'통합과학'}

    if has_korean_selectives:
        print("Korean selective subjects found. Applying special rule for '국어'.")
    if not create_social_folder and social_specifics:
        print("Only '통합사회' found. Social studies files will remain in their current directory.")
    if not create_science_folder and science_specifics:
        print("Only '통합과학' found. Science files will remain in their current directory.")

    for item in inventory:
        base_folder_name = _target_folder(item, has_korean_selectives, create_social_folder, create_science_folder)
        if not base_folder_name:
            continue
        # Join only the non-empty parts so a missing grade or prefix does not
        # leave stray double spaces in the folder name.
        full_folder_name = " ".join(part for part in (folder_prefix, grade_str, base_folder_name) if part)
        folder_path = os.path.join(directory, full_folder_name)
        os.makedirs(folder_path, exist_ok=True)
        file_name = os.path.basename(item['path'])
        destination_path = os.path.join(folder_path, file_name)
        if os.path.abspath(item['path']) == os.path.abspath(destination_path):
            continue  # Already in the right folder.
        try:
            _safe_rename(item['path'], destination_path)
        except OSError as e:
            print(f" -> ERROR: Could not move '{file_name}'. {e}")
        else:
            print(f" -> Moved '{file_name}' to '{full_folder_name}' folder.")


def check_for_missing_pairs(directory: str) -> Tuple[List[str], List[str]]:
    """Report question papers and answer sheets that lack their counterpart.

    Returns:
        ``(missing_answers, missing_questions)``: sorted subject names that
        have a question paper but no answer sheet, and vice versa.
    """
    print("\n--- Phase 3: Verifying Pairs ---")
    question_papers, answer_sheets = set(), set()
    for _, _, filenames in os.walk(directory):
        for filename in filenames:
            if not filename.startswith('20') or not filename.lower().endswith('.pdf'):
                continue
            match = _PAIR_NAME_RE.search(filename)
            if not match:
                continue
            subject_part = match.group(1).strip()
            if match.group(2) == '문제지':
                question_papers.add(subject_part)
            else:
                answer_sheets.add(subject_part)

    missing_answers = sorted(question_papers - answer_sheets)
    missing_questions = sorted(answer_sheets - question_papers)

    if not missing_answers and not missing_questions:
        print("Verification complete: All question papers and answer sheets are paired correctly.")
        return missing_answers, missing_questions

    if missing_answers:
        print("\n[!] Files with MISSING ANSWER SHEETS:")
        for subject in missing_answers:
            print(f" - {subject}")
    if missing_questions:
        print("\n[!] Files with MISSING QUESTION PAPERS:")
        for subject in missing_questions:
            print(f" - {subject}")
    return missing_answers, missing_questions


def _find_exam_context(directory: str) -> Optional[str]:
    """Return the exam information of the first PDF whose header provides it."""
    for filename in os.listdir(directory):
        if not filename.lower().endswith('.pdf'):
            continue
        header_text, _ = analyze_pdf(os.path.join(directory, filename))
        # No context exists yet, so only self-describing headers can succeed.
        _, base_info = create_new_filename(header_text)
        if base_info:
            print(f"Context found: '{base_info}'")
            return base_info
    return None


def _rename_exam_pdfs(directory: str, exam_context_base_info: Optional[str]) -> Tuple[Optional[str], List[str]]:
    """Rename the not-yet-renamed PDFs; return ``(context, deferred_scripts)``."""
    script_files_to_defer = []
    for filename in os.listdir(directory):
        if not filename.lower().endswith('.pdf'):
            continue
        if filename.startswith('20') and _ALREADY_RENAMED_RE.search(filename):
            continue  # Already carries a canonical name.

        pdf_path = os.path.join(directory, filename)
        print(f"Analyzing: {filename}")
        header_text, full_page_text = analyze_pdf(pdf_path)

        # Prefer the header band; fall back to the text of the whole page.
        new_filename, base_info = create_new_filename(header_text, context_base_info=exam_context_base_info)
        if not new_filename:
            new_filename, base_info = create_new_filename(full_page_text, context_base_info=exam_context_base_info)

        if new_filename and new_filename != filename:
            try:
                _safe_rename(pdf_path, os.path.join(directory, new_filename))
            except OSError as e:
                print(f" -> ERROR: Renaming failed. {e}")
            else:
                print(f" -> Renamed to: {new_filename}")
                # Normally found by the initial scan already; kept as a safety net.
                if not exam_context_base_info and base_info:
                    exam_context_base_info = base_info
                    print(f" -> Context found: '{exam_context_base_info}'")
        elif '대본' in full_page_text and '영어' in full_page_text:
            print(" -> Script file found. Deferring renaming.")
            script_files_to_defer.append(pdf_path)
        else:
            print(f" -> NOTICE: Could not determine a new name for '{filename}'. Skipping.")
    return exam_context_base_info, script_files_to_defer


def _rename_deferred_scripts(directory: str, script_files: List[str], exam_context_base_info: Optional[str]) -> None:
    """Rename English listening scripts that had to wait for the exam context."""
    print("\n--- Processing deferred script files ---")
    if not exam_context_base_info:
        print(" -> ERROR: Could not rename script file, no context found from any other files.")
        return
    new_filename = f"{exam_context_base_info} 영어 영역_대본.pdf"
    for pdf_path in script_files:
        try:
            _safe_rename(pdf_path, os.path.join(directory, new_filename))
        except OSError as e:
            print(f" -> ERROR: Renaming deferred file failed. {e}")
        else:
            print(f" -> Renamed deferred '{os.path.basename(pdf_path)}' to: {new_filename}")


def _rename_listening_audio(directory: str, exam_base_name: str) -> None:
    """Rename not-yet-renamed MP3 files to the English listening filename."""
    new_mp3_name = f"{exam_base_name} 영어 영역_듣기.mp3"
    for filename in os.listdir(directory):
        if not filename.lower().endswith('.mp3') or filename.startswith('20'):
            continue
        try:
            _safe_rename(os.path.join(directory, filename), os.path.join(directory, new_mp3_name))
        except OSError as e:
            # Best effort, but report the failure instead of hiding it.
            print(f" -> ERROR: Renaming MP3 '{filename}' failed. {e}")
        else:
            print(f"Renamed MP3 to: {new_mp3_name}")


# --- Main Controller Function ---
def process_directory(directory: str) -> None:
    """Rename, organize and verify all exam files found in *directory*."""
    if not os.path.isdir(directory):
        print(f"ERROR: Directory not found -> {directory}")
        return

    # Scan the directory once up front so files without their own exam
    # information can still be named from the shared context.
    exam_context_base_info = _find_exam_context(directory)

    print("\n--- Phase 1: Renaming original files ---")
    exam_context_base_info, script_files_to_defer = _rename_exam_pdfs(directory, exam_context_base_info)
    if script_files_to_defer:
        _rename_deferred_scripts(directory, script_files_to_defer, exam_context_base_info)
    print("\nRenaming phase complete.")

    folder_prefix, grade_str = get_folder_prefix_and_grade(directory)
    if folder_prefix:
        print(f"\nDetected common folder prefix: '{folder_prefix}' and grade: '{grade_str}'")
    if folder_prefix and grade_str:
        exam_base_name = f"{folder_prefix} {grade_str} 전국연합학력평가".strip()
        _rename_listening_audio(directory, exam_base_name)

    organize_files_final(directory, folder_prefix, grade_str)
    check_for_missing_pairs(directory)
    print("\n--- All tasks finished. ---")


if __name__ == "__main__":
    # The print statements are kept on purpose to make debugging easier.
    process_directory(TARGET_DIR)