diff --git a/src/tagstudio/core/library/alchemy/library.py b/src/tagstudio/core/library/alchemy/library.py index 1ce4fc85f..fb1b900a6 100644 --- a/src/tagstudio/core/library/alchemy/library.py +++ b/src/tagstudio/core/library/alchemy/library.py @@ -1006,6 +1006,10 @@ def has_path_entry(self, path: Path) -> bool: with Session(self.engine) as session: return session.query(exists().where(Entry.path == path)).scalar() + def all_paths(self) -> Iterable[tuple[int, Path]]: + with Session(self.engine) as session: + return ((i, p) for i, p in session.execute(select(Entry.id, Entry.path)).all()) + def get_paths(self, limit: int = -1) -> list[str]: path_strings: list[str] = [] with Session(self.engine) as session: diff --git a/src/tagstudio/core/library/alchemy/registries/unlinked_registry.py b/src/tagstudio/core/library/alchemy/registries/unlinked_registry.py deleted file mode 100644 index 8058df85f..000000000 --- a/src/tagstudio/core/library/alchemy/registries/unlinked_registry.py +++ /dev/null @@ -1,94 +0,0 @@ -from collections.abc import Iterator -from dataclasses import dataclass, field -from pathlib import Path - -import structlog -from wcmatch import pathlib - -from tagstudio.core.library.alchemy.library import Library -from tagstudio.core.library.alchemy.models import Entry -from tagstudio.core.library.ignore import PATH_GLOB_FLAGS, Ignore, ignore_to_glob -from tagstudio.core.utils.types import unwrap - -logger = structlog.get_logger() - - -@dataclass -class UnlinkedRegistry: - """State tracker for unlinked entries.""" - - lib: Library - files_fixed_count: int = 0 - unlinked_entries: list[Entry] = field(default_factory=list) - - @property - def unlinked_entries_count(self) -> int: - return len(self.unlinked_entries) - - def reset(self): - self.unlinked_entries.clear() - - def refresh_unlinked_files(self) -> Iterator[int]: - """Track the number of entries that point to an invalid filepath.""" - logger.info("[UnlinkedRegistry] Refreshing unlinked files...") - - self.unlinked_entries = [] - for i, entry in enumerate(self.lib.all_entries()): - full_path = unwrap(self.lib.library_dir) / entry.path - if not full_path.exists() or not full_path.is_file(): - self.unlinked_entries.append(entry) - yield i - - def match_unlinked_file_entry(self, match_entry: Entry) -> list[Path]: - """Try and match unlinked file entries with matching results in the library directory. - - Works if files were just moved to different subfolders and don't have duplicate names. - """ - library_dir = unwrap(self.lib.library_dir) - matches: list[Path] = [] - - # NOTE: ignore_to_glob() is needed for wcmatch, not ripgrep. - ignore_patterns = ignore_to_glob(Ignore.get_patterns(library_dir)) - for path in pathlib.Path(str(library_dir)).glob( - f"***/{match_entry.path.name}", - flags=PATH_GLOB_FLAGS, - exclude=ignore_patterns, - ): - if path.is_dir(): - continue - if path.name == match_entry.path.name: - new_path = Path(path).relative_to(library_dir) - matches.append(new_path) - - logger.info("[UnlinkedRegistry] Matches", matches=matches) - return matches - - def fix_unlinked_entries(self) -> Iterator[int]: - """Attempt to fix unlinked file entries by finding a match in the library directory.""" - self.files_fixed_count = 0 - matched_entries: list[Entry] = [] - for i, entry in enumerate(self.unlinked_entries): - item_matches = self.match_unlinked_file_entry(entry) - if len(item_matches) == 1: - logger.info( - "[UnlinkedRegistry]", - entry=entry.path.as_posix(), - item_matches=item_matches[0].as_posix(), - ) - if not self.lib.update_entry_path(entry.id, item_matches[0]): - try: - match = unwrap(self.lib.get_entry_full_by_path(item_matches[0])) - entry_full = unwrap(self.lib.get_entry_full(entry.id)) - self.lib.merge_entries(entry_full, match) - except AttributeError: - continue - self.files_fixed_count += 1 - matched_entries.append(entry) - yield i - - for entry in matched_entries: - self.unlinked_entries.remove(entry) - - def remove_unlinked_entries(self) -> None: - self.lib.remove_entries(list(map(lambda unlinked: unlinked.id, self.unlinked_entries))) - self.unlinked_entries = [] diff --git a/src/tagstudio/core/library/ignore.py b/src/tagstudio/core/library/ignore.py index e3eda7ed1..d4da249af 100644 --- a/src/tagstudio/core/library/ignore.py +++ b/src/tagstudio/core/library/ignore.py @@ -14,7 +14,7 @@ logger = structlog.get_logger() -PATH_GLOB_FLAGS = glob.GLOBSTARLONG | glob.DOTGLOB | glob.NEGATE | pathlib.MATCHBASE +PATH_GLOB_FLAGS = glob.GLOBSTARLONG | glob.DOTGLOB | glob.NEGATE | pathlib.MATCHBASE | pathlib.NODIR GLOBAL_IGNORE = [ diff --git a/src/tagstudio/core/library/refresh.py b/src/tagstudio/core/library/refresh.py index 1b2115cdd..cf6fdda6f 100644 --- a/src/tagstudio/core/library/refresh.py +++ b/src/tagstudio/core/library/refresh.py @@ -4,6 +4,7 @@ import shutil +import sys from collections.abc import Iterator from dataclasses import dataclass, field from datetime import datetime as dt @@ -11,7 +12,7 @@ from time import time import structlog -from wcmatch import pathlib +from wcmatch import glob from tagstudio.core.library.alchemy.library import Library from tagstudio.core.library.alchemy.models import Entry @@ -25,20 +26,37 @@ @dataclass class RefreshTracker: library: Library - files_not_in_library: list[Path] = field(default_factory=list) + + _paths_to_id: dict[str, int] = field(default_factory=dict) + _expected_paths: set[str] = field(default_factory=set) + + _missing_paths: dict[str, int] = field(default_factory=dict) + _new_paths: list[Path] = field(default_factory=list) @property - def files_count(self) -> int: - return len(self.files_not_in_library) + def missing_files_count(self) -> int: + return len(self._missing_paths) + + @property + def new_files_count(self) -> int: + return len(self._new_paths) + + def _add_path(self, entry_id: int, path: str): + self._paths_to_id[path] = entry_id + self._expected_paths.add(path) + + def _del_path(self, path: str): + self._paths_to_id.pop(path) + self._expected_paths.remove(path) def save_new_files(self) -> Iterator[int]: """Save the list of files that are not in the library.""" batch_size = 200 index = 0 - while index < len(self.files_not_in_library): + while index < len(self._new_paths): yield index - end = min(len(self.files_not_in_library), index + batch_size) + end = min(len(self._new_paths), index + batch_size) entries = [ Entry( path=entry_path, @@ -46,11 +64,50 @@ def save_new_files(self) -> Iterator[int]: fields=[], date_added=dt.now(), ) - for entry_path in self.files_not_in_library[index:end] + for entry_path in self._new_paths[index:end] ] - self.library.add_entries(entries) + entry_ids = self.library.add_entries(entries) index = end - self.files_not_in_library = [] + + for i in range(len(entries)): + id = entry_ids[i] + path = str(entries[i].path) + self._add_path(id, path) + + self._new_paths.clear() + + def fix_unlinked_entries(self): + """Attempt to fix unlinked file entries by finding a match in the library directory.""" + new_paths: dict[str, list[Path]] = {} + for path in self._new_paths: + path = Path(path) + new_paths.setdefault(path.name, []).append(path) + + fixed: list[str] = [] + for ( + path, + entry_id, + ) in self._missing_paths.items(): + name = Path(path).name + if name not in new_paths or len(new_paths[name]) != 1: + continue + new_path = new_paths.pop(name)[0] + if self.library.update_entry_path(entry_id, new_path): + self._del_path(path) + self._add_path(entry_id, str(new_path)) + fixed.append(path) + + for path in fixed: + self._missing_paths.pop(path) + + def remove_unlinked_entries(self) -> None: + to_remove = [] + for path, id in self._missing_paths.items(): + to_remove.append(id) + self._del_path(path) + self._missing_paths.clear() + + self.library.remove_entries(to_remove) def refresh_dir(self, library_dir: Path, force_internal_tools: bool = False) -> Iterator[int]: """Scan a directory for files, and add those relative filenames to internal variables. @@ -63,20 +120,27 @@ def refresh_dir(self, library_dir: Path, force_internal_tools: bool = False) -> if self.library.library_dir is None: raise ValueError("No library directory set.") - ignore_patterns = Ignore.get_patterns(library_dir) + start_time = time() + self._paths_to_id = dict((str(p), i) for i, p in self.library.all_paths()) + self._expected_paths = set(self._paths_to_id.keys()) + logger.info( + "[Refresh]: Fetch entry paths", + duration=(time() - start_time), + ) - if force_internal_tools: - return self.__wc_add(library_dir, ignore_to_glob(ignore_patterns)) + ignore_patterns = Ignore.get_patterns(library_dir) - dir_list: list[str] | None = self.__get_dir_list(library_dir, ignore_patterns) + yield 0 + progress = None + if not force_internal_tools: + progress = self.__rg(library_dir, ignore_patterns) # Use ripgrep if it was found and working, else fallback to wcmatch. - if dir_list is not None: - return self.__rg_add(library_dir, dir_list) - else: - return self.__wc_add(library_dir, ignore_to_glob(ignore_patterns)) + if progress is None: + progress = self.__wc(library_dir, ignore_patterns) + yield from progress - def __get_dir_list(self, library_dir: Path, ignore_patterns: list[str]) -> list[str] | None: + def __rg(self, library_dir: Path, ignore_patterns: list[str]) -> Iterator[int] | None: """Use ripgrep to return a list of matched directories and files. Return `None` if ripgrep not found on system. @@ -92,6 +156,7 @@ def __get_dir_list(self, library_dir: Path, ignore_patterns: list[str]) -> list[ with open(compiled_ignore_path, "w") as pattern_file: pattern_file.write("\n".join(ignore_patterns)) + start_time = time() result = silent_run( " ".join( [ @@ -105,102 +170,64 @@ def __get_dir_list(self, library_dir: Path, ignore_patterns: list[str]) -> list[ ), cwd=library_dir, capture_output=True, - text=True, shell=True, ) + logger.info( + "[Refresh]: ripgrep scan time", + duration=(time() - start_time), + ) compiled_ignore_path.unlink() if result.stderr: logger.error(result.stderr) - return result.stdout.splitlines() # pyright: ignore [reportReturnType] + paths = set(result.stdout.decode(sys.stdout.encoding).splitlines()) + self.__add(library_dir, paths) + yield len(paths) + return None logger.warning("[Refresh: ripgrep not found on system]") return None - def __rg_add(self, library_dir: Path, dir_list: list[str]) -> Iterator[int]: - start_time_total = time() - start_time_loop = time() - dir_file_count = 0 - self.files_not_in_library = [] - - for r in dir_list: - f = pathlib.Path(r) - - end_time_loop = time() - # Yield output every 1/30 of a second - if (end_time_loop - start_time_loop) > 0.034: - yield dir_file_count - start_time_loop = time() - - # Skip if the file/path is already mapped in the Library - if f in self.library.included_files: - dir_file_count += 1 - continue - - # Ignore if the file is a directory - if f.is_dir(): - continue + def __wc(self, library_dir: Path, ignore_patterns: list[str]) -> Iterator[int]: + logger.info("[Refresh]: Falling back to wcmatch for scanning") - dir_file_count += 1 - self.library.included_files.add(f) + ignore_patterns = ignore_to_glob(ignore_patterns) + try: + paths = set() - if not self.library.has_path_entry(f): - self.files_not_in_library.append(f) + start_time = time() + search = glob.iglob( + "***/*", root_dir=library_dir, flags=PATH_GLOB_FLAGS, exclude=ignore_patterns + ) + for i, path in enumerate(search): + if i < 100 or (i % 100) == 0: + yield i + paths.add(path) + logger.info( + "[Refresh]: wcmatch scan time", + duration=(time() - start_time), + ) + yield len(paths) - end_time_total = time() - yield dir_file_count - logger.info( - "[Refresh]: Directory scan time", - path=library_dir, - duration=(end_time_total - start_time_total), - files_scanned=dir_file_count, - tool_used="ripgrep (system)", - ) + self.__add(library_dir, paths) + except ValueError: + logger.info("[Refresh]: ValueError when refreshing directory with wcmatch!") - def __wc_add(self, library_dir: Path, ignore_patterns: list[str]) -> Iterator[int]: + def __add(self, library_dir: Path, paths: set[str]): start_time_total = time() - start_time_loop = time() - dir_file_count = 0 - self.files_not_in_library = [] - logger.info("[Refresh]: Falling back to wcmatch for scanning") - - try: - for f in pathlib.Path(str(library_dir)).glob( - "***/*", flags=PATH_GLOB_FLAGS, exclude=ignore_patterns - ): - end_time_loop = time() - # Yield output every 1/30 of a second - if (end_time_loop - start_time_loop) > 0.034: - yield dir_file_count - start_time_loop = time() - - # Skip if the file/path is already mapped in the Library - if f in self.library.included_files: - dir_file_count += 1 - continue - - # Ignore if the file is a directory - if f.is_dir(): - continue - - dir_file_count += 1 - self.library.included_files.add(f) - - relative_path = f.relative_to(library_dir) - - if not self.library.has_path_entry(relative_path): - self.files_not_in_library.append(relative_path) - except ValueError: - logger.info("[Refresh]: ValueError when refreshing directory with wcmatch!") + new = paths.difference(self._expected_paths) + missing = self._expected_paths.difference(paths) + self._new_paths = [Path(p) for p in new] + self._missing_paths = dict((p, self._paths_to_id[p]) for p in missing) end_time_total = time() - yield dir_file_count logger.info( "[Refresh]: Directory scan time", path=library_dir, duration=(end_time_total - start_time_total), - files_scanned=dir_file_count, - tool_used="wcmatch (internal)", + files_scanned=len(paths), + missing=len(self._missing_paths), + new=len(self._new_paths), ) diff --git a/src/tagstudio/qt/controllers/library_scanner_controller.py b/src/tagstudio/qt/controllers/library_scanner_controller.py new file mode 100644 index 000000000..7f73dc46d --- /dev/null +++ b/src/tagstudio/qt/controllers/library_scanner_controller.py @@ -0,0 +1,117 @@ +from typing import TYPE_CHECKING + +from PySide6.QtCore import QThreadPool +from PySide6.QtWidgets import QWidget + +from tagstudio.core.library.refresh import RefreshTracker +from tagstudio.core.utils.types import unwrap +from tagstudio.qt.mixed.progress_bar import ProgressWidget +from tagstudio.qt.mixed.unlinked_entries_modal import UnlinkedEntriesModal +from tagstudio.qt.translations import Translations +from tagstudio.qt.utils.custom_runnable import CustomRunnable +from tagstudio.qt.utils.function_iterator import FunctionIterator + +if TYPE_CHECKING: + from tagstudio.core.library.alchemy.library import Library + from tagstudio.qt.ts_qt import QtDriver + + +class LibraryScannerController(QWidget): + def __init__(self, driver: "QtDriver", lib: "Library"): + super().__init__() + self.driver = driver + self.lib = lib + self.tracker = RefreshTracker(lib) + self.unlinked_modal = UnlinkedEntriesModal(self.driver, self) + + @property + def unlinked_entries_count(self) -> int: + return self.tracker.missing_files_count + + @property + def new_files_count(self) -> int: + return self.tracker.new_files_count + + @property + def unlinked_paths(self) -> list[str]: + return list(self.tracker._missing_paths.keys()) + + def _progress_bar(self, pw: ProgressWidget, iterator, on_update, on_finish): + pw.show() + iterator = FunctionIterator(iterator) + iterator.value.connect(on_update) + r = CustomRunnable(iterator.run) + r.done.connect(lambda: (pw.hide(), pw.deleteLater(), on_finish())) + QThreadPool.globalInstance().start(r) + + def scan(self, on_finish=None): + pw = ScanProgressWidget() + + def default_on_finish(): + if self.tracker.missing_files_count > 0: + self.open_unlinked_view() + else: + self.save_new_files() + + library_dir = unwrap(self.lib.library_dir) + self._progress_bar( + pw, + iterator=lambda: self.tracker.refresh_dir(library_dir), + on_update=lambda i: pw.on_update(i), + on_finish=on_finish or default_on_finish, + ) + + def open_unlinked_view(self): + self.unlinked_modal.show() + + def save_new_files(self): + files_to_save = self.tracker.new_files_count + if files_to_save == 0: + return + + pw = SaveNewProgressWidget(files_to_save) + self._progress_bar( + pw, + iterator=self.tracker.save_new_files, + on_update=lambda i: pw.on_update(i), + on_finish=lambda: files_to_save and self.driver.update_browsing_state(), + ) + + +class ScanProgressWidget(ProgressWidget): + def __init__(self): + super().__init__( + cancel_button_text=None, + minimum=0, + maximum=0, + window_title=Translations["library.refresh.title"], + label_text=Translations["library.refresh.scanning_preparing"], + ) + + def on_update(self, files_searched: int): + self.update_label( + Translations.format( + "library.refresh.scanning.singular" + if files_searched == 1 + else "library.refresh.scanning.plural", + searched_count=f"{files_searched:n}", + found_count=0, # New files are found after scan in single step so no progress + ) + ) + + +class SaveNewProgressWidget(ProgressWidget): + def __init__(self, files_to_save: int): + super().__init__( + cancel_button_text=None, + minimum=0, + maximum=files_to_save, + window_title=Translations["entries.running.dialog.title"], + label_text=Translations.format("library.refresh.scanning_preparing", total=0), + ) + + def on_update(self, files_saved: int): + self.update_progress(files_saved) + self.update_label( + Translations.format("entries.running.dialog.new_entries", total=f"{files_saved:n}") + ) diff --git a/src/tagstudio/qt/mixed/fix_unlinked.py b/src/tagstudio/qt/mixed/fix_unlinked.py deleted file mode 100644 index 63a1affc6..000000000 --- a/src/tagstudio/qt/mixed/fix_unlinked.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (C) 2025 Travis Abendshien (CyanVoxel). -# Licensed under the GPL-3.0 License. -# Created for TagStudio: https://github.com/CyanVoxel/TagStudio - - -from typing import TYPE_CHECKING, override - -from PySide6 import QtCore, QtGui -from PySide6.QtCore import Qt -from PySide6.QtWidgets import QHBoxLayout, QLabel, QPushButton, QVBoxLayout, QWidget - -from tagstudio.core.library.alchemy.library import Library -from tagstudio.core.library.alchemy.registries.unlinked_registry import UnlinkedRegistry -from tagstudio.qt.mixed.merge_dupe_entries import MergeDuplicateEntries -from tagstudio.qt.mixed.progress_bar import ProgressWidget -from tagstudio.qt.mixed.relink_entries_modal import RelinkUnlinkedEntries -from tagstudio.qt.mixed.remove_unlinked_modal import RemoveUnlinkedEntriesModal -from tagstudio.qt.translations import Translations - -# Only import for type checking/autocompletion, will not be imported at runtime. -if TYPE_CHECKING: - from tagstudio.qt.ts_qt import QtDriver - - -# TODO: Break up into MVC classes, similar to fix_ignored_modal -class FixUnlinkedEntriesModal(QWidget): - def __init__(self, library: "Library", driver: "QtDriver"): - super().__init__() - self.lib = library - self.driver = driver - - self.tracker = UnlinkedRegistry(lib=self.lib) - - self.unlinked_count = -1 - self.dupe_count = -1 - self.setWindowTitle(Translations["entries.unlinked.title"]) - self.setWindowModality(Qt.WindowModality.ApplicationModal) - self.setMinimumSize(400, 300) - self.root_layout = QVBoxLayout(self) - self.root_layout.setContentsMargins(6, 6, 6, 6) - - self.unlinked_desc_widget = QLabel(Translations["entries.unlinked.description"]) - self.unlinked_desc_widget.setObjectName("unlinkedDescriptionLabel") - self.unlinked_desc_widget.setWordWrap(True) - self.unlinked_desc_widget.setStyleSheet("text-align:left;") - - self.unlinked_count_label = QLabel() - self.unlinked_count_label.setObjectName("unlinkedCountLabel") - self.unlinked_count_label.setAlignment(Qt.AlignmentFlag.AlignCenter) - - self.dupe_count_label = QLabel() - self.dupe_count_label.setObjectName("dupeCountLabel") - self.dupe_count_label.setAlignment(Qt.AlignmentFlag.AlignCenter) - - self.refresh_unlinked_button = QPushButton(Translations["entries.generic.refresh_alt"]) - self.refresh_unlinked_button.clicked.connect(self.refresh_unlinked) - - self.merge_class = MergeDuplicateEntries(self.lib, self.driver) - self.relink_class = RelinkUnlinkedEntries(self.tracker) - - self.search_button = QPushButton(Translations["entries.unlinked.search_and_relink"]) - self.relink_class.done.connect( - # refresh the grid - lambda: ( - self.driver.update_browsing_state(), - self.refresh_unlinked(), - ) - ) - self.search_button.clicked.connect(self.relink_class.repair_entries) - - self.manual_button = QPushButton(Translations["entries.unlinked.relink.manual"]) - self.manual_button.setHidden(True) - - self.remove_button = QPushButton(Translations["entries.unlinked.remove_alt"]) - self.remove_modal = RemoveUnlinkedEntriesModal(self.driver, self.tracker) - self.remove_modal.done.connect( - lambda: ( - self.set_unlinked_count(), - # refresh the grid - self.driver.update_browsing_state(), - self.refresh_unlinked(), - ) - ) - self.remove_button.clicked.connect(self.remove_modal.show) - - self.button_container = QWidget() - self.button_layout = QHBoxLayout(self.button_container) - self.button_layout.setContentsMargins(6, 6, 6, 6) - self.button_layout.addStretch(1) - - self.done_button = QPushButton(Translations["generic.done_alt"]) - self.done_button.setDefault(True) - self.done_button.clicked.connect(self.hide) - self.button_layout.addWidget(self.done_button) - - self.root_layout.addWidget(self.unlinked_count_label) - self.root_layout.addWidget(self.unlinked_desc_widget) - self.root_layout.addWidget(self.refresh_unlinked_button) - self.root_layout.addWidget(self.search_button) - self.root_layout.addWidget(self.manual_button) - self.root_layout.addWidget(self.remove_button) - self.root_layout.addStretch(1) - self.root_layout.addStretch(2) - self.root_layout.addWidget(self.button_container) - - self.update_unlinked_count() - - def refresh_unlinked(self): - pw = ProgressWidget( - cancel_button_text=None, - minimum=0, - maximum=self.lib.entries_count, - ) - pw.setWindowTitle(Translations["library.scan_library.title"]) - pw.update_label(Translations["entries.unlinked.scanning"]) - - def update_driver_widgets(): - if ( - hasattr(self.driver, "library_info_window") - and self.driver.library_info_window.isVisible() - ): - self.driver.library_info_window.update_cleanup() - - pw.from_iterable_function( - self.tracker.refresh_unlinked_files, - None, - self.set_unlinked_count, - self.update_unlinked_count, - self.remove_modal.refresh_list, - update_driver_widgets, - ) - - def set_unlinked_count(self): - """Sets the unlinked_entries_count in the Library to the tracker's value.""" - self.lib.unlinked_entries_count = self.tracker.unlinked_entries_count - - def update_unlinked_count(self): - """Updates the UI to reflect the Library's current unlinked_entries_count.""" - # Indicates that the library is new compared to the last update. - # NOTE: Make sure set_unlinked_count() is called before this! - if self.tracker.unlinked_entries_count > 0 and self.lib.unlinked_entries_count < 0: - self.tracker.reset() - - count: int = self.lib.unlinked_entries_count - - self.search_button.setDisabled(count < 1) - self.remove_button.setDisabled(count < 1) - - count_text: str = Translations.format( - "entries.unlinked.unlinked_count", count=count if count >= 0 else "—" - ) - self.unlinked_count_label.setText(f"