From ad93d54bf3aa83ccafb2159c40573fcb252f8bfc Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sat, 20 Jul 2024 14:13:50 -0400 Subject: [PATCH 1/2] cache "concrete" dists by Distribution instead of InstallRequirement --- news/12863.trivial.rst | 1 + src/pip/_internal/distributions/__init__.py | 5 + src/pip/_internal/distributions/base.py | 19 +++- src/pip/_internal/distributions/installed.py | 14 ++- src/pip/_internal/distributions/sdist.py | 20 +++- src/pip/_internal/distributions/wheel.py | 4 +- src/pip/_internal/metadata/base.py | 21 ++++ .../_internal/metadata/importlib/_dists.py | 19 +++- src/pip/_internal/metadata/importlib/_envs.py | 8 +- src/pip/_internal/metadata/pkg_resources.py | 15 ++- src/pip/_internal/operations/prepare.py | 47 +++++---- src/pip/_internal/req/req_install.py | 96 ++++++++++++++----- .../resolution/resolvelib/resolver.py | 2 +- .../metadata/test_metadata_pkg_resources.py | 1 + tests/unit/test_req.py | 31 +++++- 15 files changed, 231 insertions(+), 72 deletions(-) create mode 100644 news/12863.trivial.rst diff --git a/news/12863.trivial.rst b/news/12863.trivial.rst new file mode 100644 index 00000000000..dc36a82a0df --- /dev/null +++ b/news/12863.trivial.rst @@ -0,0 +1 @@ +Cache "concrete" dists by ``Distribution`` instead of ``InstallRequirement``. diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index 9a89a838b9a..f6089daf40d 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -1,4 +1,5 @@ from pip._internal.distributions.base import AbstractDistribution +from pip._internal.distributions.installed import InstalledDistribution from pip._internal.distributions.sdist import SourceDistribution from pip._internal.distributions.wheel import WheelDistribution from pip._internal.req.req_install import InstallRequirement @@ -8,6 +9,10 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: """Returns a Distribution for the given InstallRequirement""" + # Only pre-installed requirements will have a .satisfied_by dist. + if install_req.satisfied_by: + return InstalledDistribution(install_req) + # Editable requirements will always be source distributions. They use the # legacy logic until we create a modern standard for them. if install_req.editable: diff --git a/src/pip/_internal/distributions/base.py b/src/pip/_internal/distributions/base.py index 6e4d0c91a90..0a132b88f98 100644 --- a/src/pip/_internal/distributions/base.py +++ b/src/pip/_internal/distributions/base.py @@ -37,11 +37,17 @@ def build_tracker_id(self) -> Optional[str]: If None, then this dist has no work to do in the build tracker, and ``.prepare_distribution_metadata()`` will not be called.""" - raise NotImplementedError() + ... @abc.abstractmethod def get_metadata_distribution(self) -> BaseDistribution: - raise NotImplementedError() + """Generate a concrete ``BaseDistribution`` instance for this artifact. + + The implementation should also cache the result with + ``self.req.cache_concrete_dist()`` so the distribution is available to other + users of the ``InstallRequirement``. This method is not called within the build + tracker context, so it should not identify any new setup requirements.""" + ... @abc.abstractmethod def prepare_distribution_metadata( @@ -50,4 +56,11 @@ def prepare_distribution_metadata( build_isolation: bool, check_build_deps: bool, ) -> None: - raise NotImplementedError() + """Generate the information necessary to extract metadata from the artifact. + + This method will be executed within the context of ``BuildTracker#track()``, so + it needs to fully identify any setup requirements so they can be added to the + same active set of tracked builds, while ``.get_metadata_distribution()`` takes + care of generating and caching the ``BaseDistribution`` to expose to the rest of + the resolve.""" + ... diff --git a/src/pip/_internal/distributions/installed.py b/src/pip/_internal/distributions/installed.py index ab8d53be740..83e99fca9ca 100644 --- a/src/pip/_internal/distributions/installed.py +++ b/src/pip/_internal/distributions/installed.py @@ -1,9 +1,11 @@ -from typing import Optional +from typing import TYPE_CHECKING, Optional from pip._internal.distributions.base import AbstractDistribution -from pip._internal.index.package_finder import PackageFinder from pip._internal.metadata import BaseDistribution +if TYPE_CHECKING: + from pip._internal.index.package_finder import PackageFinder + class InstalledDistribution(AbstractDistribution): """Represents an installed package. @@ -17,12 +19,14 @@ def build_tracker_id(self) -> Optional[str]: return None def get_metadata_distribution(self) -> BaseDistribution: - assert self.req.satisfied_by is not None, "not actually installed" - return self.req.satisfied_by + dist = self.req.satisfied_by + assert dist is not None, "not actually installed" + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, - finder: PackageFinder, + finder: "PackageFinder", build_isolation: bool, check_build_deps: bool, ) -> None: diff --git a/src/pip/_internal/distributions/sdist.py b/src/pip/_internal/distributions/sdist.py index 28ea5cea16c..3e4a3e8d371 100644 --- a/src/pip/_internal/distributions/sdist.py +++ b/src/pip/_internal/distributions/sdist.py @@ -1,10 +1,10 @@ import logging -from typing import TYPE_CHECKING, Iterable, Optional, Set, Tuple +from typing import TYPE_CHECKING, Iterable, Set, Tuple from pip._internal.build_env import BuildEnvironment from pip._internal.distributions.base import AbstractDistribution from pip._internal.exceptions import InstallationError -from pip._internal.metadata import BaseDistribution +from pip._internal.metadata import BaseDistribution, get_directory_distribution from pip._internal.utils.subprocess import runner_with_spinner_message if TYPE_CHECKING: @@ -21,13 +21,19 @@ class SourceDistribution(AbstractDistribution): """ @property - def build_tracker_id(self) -> Optional[str]: + def build_tracker_id(self) -> str: """Identify this requirement uniquely by its link.""" assert self.req.link return self.req.link.url_without_fragment def get_metadata_distribution(self) -> BaseDistribution: - return self.req.get_dist() + assert ( + self.req.metadata_directory + ), "Set as part of .prepare_distribution_metadata()" + dist = get_directory_distribution(self.req.metadata_directory) + self.req.cache_concrete_dist(dist) + self.req.validate_sdist_metadata() + return dist def prepare_distribution_metadata( self, @@ -66,7 +72,11 @@ def prepare_distribution_metadata( self._raise_conflicts("the backend dependencies", conflicting) if missing: self._raise_missing_reqs(missing) - self.req.prepare_metadata() + + # NB: we must still call .cache_concrete_dist() and .validate_sdist_metadata() + # before the InstallRequirement itself has been updated with the metadata from + # this directory! + self.req.prepare_metadata_directory() def _prepare_build_backend(self, finder: "PackageFinder") -> None: # Isolate in a BuildEnvironment and install the build-time diff --git a/src/pip/_internal/distributions/wheel.py b/src/pip/_internal/distributions/wheel.py index bfadd39dcb7..e75c7910379 100644 --- a/src/pip/_internal/distributions/wheel.py +++ b/src/pip/_internal/distributions/wheel.py @@ -31,7 +31,9 @@ def get_metadata_distribution(self) -> BaseDistribution: assert self.req.local_file_path, "Set as part of preparation during download" assert self.req.name, "Wheels are never unnamed" wheel = FilesystemWheel(self.req.local_file_path) - return get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + dist = get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index 9eabcdb278b..7e8c5b6f177 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -97,6 +97,15 @@ class RequiresEntry(NamedTuple): class BaseDistribution(Protocol): + @property + def is_concrete(self) -> bool: + """Whether the distribution really exists somewhere on disk. + + If this is false, it has been synthesized from metadata, e.g. via + ``.from_metadata_file_contents()``, or ``.from_wheel()`` against + a ``MemoryWheel``.""" + raise NotImplementedError() + @classmethod def from_directory(cls, directory: str) -> "BaseDistribution": """Load the distribution from a metadata directory. @@ -667,6 +676,10 @@ def iter_installed_distributions( class Wheel(Protocol): location: str + @property + def is_concrete(self) -> bool: + raise NotImplementedError() + def as_zipfile(self) -> zipfile.ZipFile: raise NotImplementedError() @@ -675,6 +688,10 @@ class FilesystemWheel(Wheel): def __init__(self, location: str) -> None: self.location = location + @property + def is_concrete(self) -> bool: + return True + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.location, allowZip64=True) @@ -684,5 +701,9 @@ def __init__(self, location: str, stream: IO[bytes]) -> None: self.location = location self.stream = stream + @property + def is_concrete(self) -> bool: + return False + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.stream, allowZip64=True) diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py index 36cd326232e..63b5ff1a012 100644 --- a/src/pip/_internal/metadata/importlib/_dists.py +++ b/src/pip/_internal/metadata/importlib/_dists.py @@ -102,16 +102,22 @@ def __init__( dist: importlib.metadata.Distribution, info_location: Optional[BasePath], installed_location: Optional[BasePath], + concrete: bool, ) -> None: self._dist = dist self._info_location = info_location self._installed_location = installed_location + self._concrete = concrete + + @property + def is_concrete(self) -> bool: + return self._concrete @classmethod def from_directory(cls, directory: str) -> BaseDistribution: info_location = pathlib.Path(directory) dist = importlib.metadata.Distribution.at(info_location) - return cls(dist, info_location, info_location.parent) + return cls(dist, info_location, info_location.parent, concrete=True) @classmethod def from_metadata_file_contents( @@ -128,7 +134,7 @@ def from_metadata_file_contents( metadata_path.write_bytes(metadata_contents) # Construct dist pointing to the newly created directory. dist = importlib.metadata.Distribution.at(metadata_path.parent) - return cls(dist, metadata_path.parent, None) + return cls(dist, metadata_path.parent, None, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -137,7 +143,14 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: dist = WheelDistribution.from_zipfile(zf, name, wheel.location) except zipfile.BadZipFile as e: raise InvalidWheel(wheel.location, name) from e - return cls(dist, dist.info_location, pathlib.PurePosixPath(wheel.location)) + except UnsupportedWheel as e: + raise UnsupportedWheel(f"{name} has an invalid wheel, {e}") + return cls( + dist, + dist.info_location, + pathlib.PurePosixPath(wheel.location), + concrete=wheel.is_concrete, + ) @property def location(self) -> Optional[str]: diff --git a/src/pip/_internal/metadata/importlib/_envs.py b/src/pip/_internal/metadata/importlib/_envs.py index 70cb7a6009a..b7355d4935c 100644 --- a/src/pip/_internal/metadata/importlib/_envs.py +++ b/src/pip/_internal/metadata/importlib/_envs.py @@ -80,7 +80,7 @@ def find(self, location: str) -> Iterator[BaseDistribution]: installed_location: Optional[BasePath] = None else: installed_location = info_location.parent - yield Distribution(dist, info_location, installed_location) + yield Distribution(dist, info_location, installed_location, concrete=True) def find_linked(self, location: str) -> Iterator[BaseDistribution]: """Read location in egg-link files and return distributions in there. @@ -104,7 +104,7 @@ def find_linked(self, location: str) -> Iterator[BaseDistribution]: continue target_location = str(path.joinpath(target_rel)) for dist, info_location in self._find_impl(target_location): - yield Distribution(dist, info_location, path) + yield Distribution(dist, info_location, path, concrete=True) def _find_eggs_in_dir(self, location: str) -> Iterator[BaseDistribution]: from pip._vendor.pkg_resources import find_distributions @@ -116,7 +116,7 @@ def _find_eggs_in_dir(self, location: str) -> Iterator[BaseDistribution]: if not entry.name.endswith(".egg"): continue for dist in find_distributions(entry.path): - yield legacy.Distribution(dist) + yield legacy.Distribution(dist, concrete=True) def _find_eggs_in_zip(self, location: str) -> Iterator[BaseDistribution]: from pip._vendor.pkg_resources import find_eggs_in_zip @@ -128,7 +128,7 @@ def _find_eggs_in_zip(self, location: str) -> Iterator[BaseDistribution]: except zipimport.ZipImportError: return for dist in find_eggs_in_zip(importer, location): - yield legacy.Distribution(dist) + yield legacy.Distribution(dist, concrete=True) def find_eggs(self, location: str) -> Iterator[BaseDistribution]: """Find eggs in a location. diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py index 4ea84f93a6f..428b1f4a7dd 100644 --- a/src/pip/_internal/metadata/pkg_resources.py +++ b/src/pip/_internal/metadata/pkg_resources.py @@ -81,8 +81,9 @@ def run_script(self, script_name: str, namespace: str) -> None: class Distribution(BaseDistribution): - def __init__(self, dist: pkg_resources.Distribution) -> None: + def __init__(self, dist: pkg_resources.Distribution, concrete: bool) -> None: self._dist = dist + self._concrete = concrete # This is populated lazily, to avoid loading metadata for all possible # distributions eagerly. self.__extra_mapping: Optional[Mapping[NormalizedName, str]] = None @@ -96,6 +97,10 @@ def _extra_mapping(self) -> Mapping[NormalizedName, str]: return self.__extra_mapping + @property + def is_concrete(self) -> bool: + return self._concrete + @classmethod def from_directory(cls, directory: str) -> BaseDistribution: dist_dir = directory.rstrip(os.sep) @@ -114,7 +119,7 @@ def from_directory(cls, directory: str) -> BaseDistribution: dist_name = os.path.splitext(dist_dir_name)[0].split("-")[0] dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata) - return cls(dist) + return cls(dist, concrete=True) @classmethod def from_metadata_file_contents( @@ -131,7 +136,7 @@ def from_metadata_file_contents( metadata=InMemoryMetadata(metadata_dict, filename), project_name=project_name, ) - return cls(dist) + return cls(dist, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -152,7 +157,7 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: metadata=InMemoryMetadata(metadata_dict, wheel.location), project_name=name, ) - return cls(dist) + return cls(dist, concrete=wheel.is_concrete) @property def location(self) -> Optional[str]: @@ -264,7 +269,7 @@ def from_paths(cls, paths: Optional[List[str]]) -> BaseEnvironment: def _iter_distributions(self) -> Iterator[BaseDistribution]: for dist in self._ws: - yield Distribution(dist) + yield Distribution(dist, concrete=True) def _search_distribution(self, name: str) -> Optional[BaseDistribution]: """Find a distribution matching the ``name`` in the environment. diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index e6aa3447200..7ad301a3c06 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -14,7 +14,6 @@ from pip._vendor.packaging.utils import canonicalize_name from pip._internal.distributions import make_distribution_for_install_requirement -from pip._internal.distributions.installed import InstalledDistribution from pip._internal.exceptions import ( DirectoryUrlHashUnsupported, HashMismatch, @@ -190,6 +189,8 @@ def _check_download_dir( ) -> Optional[str]: """Check download_dir for previously downloaded file with correct hash If a correct file is found return its path else None + + If a file is found at the given path, but with an invalid hash, the file is deleted. """ download_path = os.path.join(download_dir, link.filename) @@ -520,7 +521,9 @@ def prepare_linked_requirement( # The file is not available, attempt to fetch only metadata metadata_dist = self._fetch_metadata_only(req) if metadata_dist is not None: - req.needs_more_preparation = True + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) return metadata_dist # None of the optimizations worked, fully prepare the requirement @@ -530,27 +533,27 @@ def prepare_linked_requirements_more( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: """Prepare linked requirements more, if needed.""" - reqs = [req for req in reqs if req.needs_more_preparation] + partially_downloaded_reqs: List[InstallRequirement] = [] for req in reqs: + if req.is_concrete: + continue + # Determine if any of these requirements were already downloaded. if self.download_dir is not None and req.link.is_wheel: hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir(req.link, self.download_dir, hashes) + # If the file is there, but doesn't match the hash, delete it and print + # a warning. We will be downloading it again via + # partially_downloaded_reqs. + file_path = _check_download_dir( + req.link, self.download_dir, hashes, warn_on_hash_mismatch=True + ) if file_path is not None: + # If the hash does match, then we still need to generate a concrete + # dist, but we don't have to download the wheel again. self._downloaded[req.link.url] = file_path - req.needs_more_preparation = False - # Prepare requirements we found were already downloaded for some - # reason. The other downloads will be completed separately. - partially_downloaded_reqs: List[InstallRequirement] = [] - for req in reqs: - if req.needs_more_preparation: - partially_downloaded_reqs.append(req) - else: - self._prepare_linked_requirement(req, parallel_builds) + partially_downloaded_reqs.append(req) - # TODO: separate this part out from RequirementPreparer when the v1 - # resolver can be removed! self._complete_partial_requirements( partially_downloaded_reqs, parallel_builds=parallel_builds, @@ -651,6 +654,7 @@ def _prepare_linked_requirement( def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None + assert req.is_concrete link = req.link if link.is_vcs or (link.is_existing_dir() and req.editable): # Make a .zip of the source_dir we already created. @@ -705,6 +709,8 @@ def prepare_editable_requirement( req.check_if_exists(self.use_user_site) + # This should already have been populated by the preparation of the source dist. + assert req.is_concrete return dist def prepare_installed_requirement( @@ -729,4 +735,13 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." ) - return InstalledDistribution(req).get_metadata_distribution() + dist = _get_prepared_distribution( + req, + self.build_tracker, + self.finder, + self.build_isolation, + self.check_build_deps, + ) + + assert req.is_concrete + return dist diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index 834bc513356..2710142f26a 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -7,7 +7,16 @@ import zipfile from optparse import Values from pathlib import Path -from typing import Any, Collection, Dict, Iterable, List, Optional, Sequence, Union +from typing import ( + TYPE_CHECKING, + Collection, + Dict, + Iterable, + List, + Optional, + Sequence, + Union, +) from pip._vendor.packaging.markers import Marker from pip._vendor.packaging.requirements import Requirement @@ -23,10 +32,7 @@ from pip._internal.metadata import ( BaseDistribution, get_default_environment, - get_directory_distribution, - get_wheel_distribution, ) -from pip._internal.metadata.base import FilesystemWheel from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.operations.build.metadata import generate_metadata @@ -59,6 +65,9 @@ from pip._internal.utils.virtualenv import running_under_virtualenv from pip._internal.vcs import vcs +if TYPE_CHECKING: + import email.message + logger = logging.getLogger(__name__) @@ -150,6 +159,7 @@ def __init__( self.hash_options = hash_options if hash_options else {} self.config_settings = config_settings # Set to True after successful preparation of this requirement + # TODO: this is only used in the legacy resolver: remove this! self.prepared = False # User supplied requirement are explicitly requested for installation # by the user via CLI arguments or requirements files, as opposed to, @@ -191,8 +201,11 @@ def __init__( ) self.use_pep517 = True - # This requirement needs more preparation before it can be built - self.needs_more_preparation = False + # When a dist is computed for this requirement, cache it here so it's visible + # everywhere within pip and isn't computed more than once. This may be + # a "virtual" dist without a physical location on the filesystem, or + # a "concrete" dist which has been fully downloaded. + self._dist: Optional[BaseDistribution] = None # This requirement needs to be unpacked before it can be installed. self._archive_source: Optional[Path] = None @@ -550,11 +563,11 @@ def isolated_editable_sanity_check(self) -> None: f"Consider using a build backend that supports PEP 660." ) - def prepare_metadata(self) -> None: + def prepare_metadata_directory(self) -> None: """Ensure that project metadata is available. - Under PEP 517 and PEP 660, call the backend hook to prepare the metadata. - Under legacy processing, call setup.py egg-info. + Under PEP 517 and PEP 660, call the backend hook to prepare the metadata + directory. Under legacy processing, call setup.py egg-info. """ assert self.source_dir, f"No source dir for {self}" details = self.name or f"from {self.link}" @@ -586,6 +599,8 @@ def prepare_metadata(self) -> None: details=details, ) + def validate_sdist_metadata(self) -> None: + """Ensure that we have a dist, and ensure it corresponds to expectations.""" # Act on the newly generated metadata, based on the name and version. if not self.name: self._set_requirement() @@ -595,25 +610,54 @@ def prepare_metadata(self) -> None: self.assert_source_matches_version() @property - def metadata(self) -> Any: - if not hasattr(self, "_metadata"): - self._metadata = self.get_dist().metadata - - return self._metadata + def metadata(self) -> "email.message.Message": + return self.get_dist().metadata def get_dist(self) -> BaseDistribution: - if self.metadata_directory: - return get_directory_distribution(self.metadata_directory) - elif self.local_file_path and self.is_wheel: - assert self.req is not None - return get_wheel_distribution( - FilesystemWheel(self.local_file_path), - canonicalize_name(self.req.name), - ) - raise AssertionError( - f"InstallRequirement {self} has no metadata directory and no wheel: " - f"can't make a distribution." - ) + """Retrieve the dist resolved from this requirement. + + :raises AssertionError: if the resolver has not yet been executed. + """ + if self._dist is None: + raise AssertionError(f"{self!r} has no dist associated.") + return self._dist + + def cache_virtual_metadata_only_dist(self, dist: BaseDistribution) -> None: + """Associate a "virtual" metadata-only dist to this requirement. + + This dist cannot be installed, but it can be used to complete the resolve + process. + + :raises AssertionError: if a dist has already been associated. + :raises AssertionError: if the provided dist is "concrete", i.e. exists + somewhere on the filesystem. + """ + assert self._dist is None, self + assert not dist.is_concrete, dist + self._dist = dist + + def cache_concrete_dist(self, dist: BaseDistribution) -> None: + """Associate a "concrete" dist to this requirement. + + A concrete dist exists somewhere on the filesystem and can be installed. + + :raises AssertionError: if a concrete dist has already been associated. + :raises AssertionError: if the provided dist is not concrete. + """ + if self._dist is not None: + # If we set a dist twice for the same requirement, we must be hydrating + # a concrete dist for what was previously virtual. This will occur in the + # case of `install --dry-run` when PEP 658 metadata is available. + + # TODO(#12186): avoid setting dist twice! + # assert not self._dist.is_concrete + pass + assert dist.is_concrete + self._dist = dist + + @property + def is_concrete(self) -> bool: + return self._dist is not None and self._dist.is_concrete def assert_source_matches_version(self) -> None: assert self.source_dir, f"No source dir for {self}" diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py index c12beef0b2a..c22d1707b5e 100644 --- a/src/pip/_internal/resolution/resolvelib/resolver.py +++ b/src/pip/_internal/resolution/resolvelib/resolver.py @@ -179,7 +179,7 @@ def resolve( self.factory.preparer.prepare_linked_requirements_more(reqs) for req in reqs: req.prepared = True - req.needs_more_preparation = False + assert req.is_concrete return req_set def get_installation_order( diff --git a/tests/unit/metadata/test_metadata_pkg_resources.py b/tests/unit/metadata/test_metadata_pkg_resources.py index 6044c14e4ca..195442cbe1e 100644 --- a/tests/unit/metadata/test_metadata_pkg_resources.py +++ b/tests/unit/metadata/test_metadata_pkg_resources.py @@ -104,6 +104,7 @@ def test_wheel_metadata_works() -> None: metadata=InMemoryMetadata({"METADATA": metadata.as_bytes()}, ""), project_name=name, ), + concrete=False, ) assert name == dist.canonical_name == dist.raw_name diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py index 8a95c058706..5757850c6b3 100644 --- a/tests/unit/test_req.py +++ b/tests/unit/test_req.py @@ -23,6 +23,7 @@ PreviousBuildDirError, ) from pip._internal.index.package_finder import PackageFinder +from pip._internal.metadata import get_metadata_distribution from pip._internal.models.direct_url import ArchiveInfo, DirectUrl, DirInfo, VcsInfo from pip._internal.models.link import Link from pip._internal.network.session import PipSession @@ -144,7 +145,11 @@ def test_no_reuse_existing_build_dir(self, data: TestData) -> None: ): resolver.resolve(reqset.all_requirements, True) - def test_environment_marker_extras(self, data: TestData) -> None: + def test_environment_marker_extras( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """ Test that the environment marker extras are used with non-wheel installs. @@ -154,6 +159,13 @@ def test_environment_marker_extras(self, data: TestData) -> None: os.fspath(data.packages.joinpath("LocalEnvironMarker")), ) req.user_supplied = True + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + req, "cache_concrete_dist", partial(cache_concrete_dist, req) + ) reqset.add_unnamed_requirement(req) finder = make_test_finder(find_links=[data.find_links]) with self._basic_resolver(finder) as resolver: @@ -499,12 +511,23 @@ def test_download_info_local_dir(self, data: TestData) -> None: assert req.download_info.url.startswith("file://") assert isinstance(req.download_info.info, DirInfo) - def test_download_info_local_editable_dir(self, data: TestData) -> None: + def test_download_info_local_editable_dir( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """Test that download_info is set for requirements from a local editable dir.""" finder = make_test_finder() with self._basic_resolver(finder) as resolver: ireq_url = data.packages.joinpath("FSPkg").as_uri() ireq = get_processed_req_from_line(f"-e {ireq_url}#egg=FSPkg") + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + ireq, "cache_concrete_dist", partial(cache_concrete_dist, ireq) + ) reqset = resolver.resolve([ireq], True) assert len(reqset.all_requirements) == 1 req = reqset.all_requirements[0] @@ -909,7 +932,9 @@ def test_mismatched_versions(caplog: pytest.LogCaptureFixture) -> None: metadata = email.message.Message() metadata["name"] = "simplewheel" metadata["version"] = "1.0" - req._metadata = metadata + req._dist = get_metadata_distribution( + bytes(metadata), "simplewheel-1.0.whl", "simplewheel" + ) req.assert_source_matches_version() assert caplog.records[-1].message == ( From 52133baf6c9d41f6264f64e1e71e93116ac75b4b Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:09:04 -0400 Subject: [PATCH 2/2] refactor much of RequirementPreparer and add lengthy docs --- news/12871.trivial.rst | 1 + src/pip/_internal/distributions/__init__.py | 9 +- src/pip/_internal/operations/prepare.py | 324 +++++++++++--------- 3 files changed, 181 insertions(+), 153 deletions(-) create mode 100644 news/12871.trivial.rst diff --git a/news/12871.trivial.rst b/news/12871.trivial.rst new file mode 100644 index 00000000000..186e2bcb3c4 --- /dev/null +++ b/news/12871.trivial.rst @@ -0,0 +1 @@ +Refactor much of ``RequirementPreparer`` to avoid duplicated code paths for metadata-only requirements. diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index f6089daf40d..a870b227d1e 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -8,7 +8,14 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: - """Returns a Distribution for the given InstallRequirement""" + """Returns an AbstractDistribution for the given InstallRequirement. + + As AbstractDistribution only covers installable artifacts, this method may only be + invoked at the conclusion of a resolve, when the RequirementPreparer has downloaded + the file corresponding to the resolved dist. Commands which intend to consume + metadata-only resolves without downloading should not call this method or + consume AbstractDistribution objects. + """ # Only pre-installed requirements will have a .satisfied_by dist. if install_req.satisfied_by: return InstalledDistribution(install_req) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 7ad301a3c06..17e0c9c28da 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -9,7 +9,7 @@ import shutil from dataclasses import dataclass from pathlib import Path -from typing import Dict, Iterable, List, Optional +from typing import Dict, Iterable, Optional from pip._vendor.packaging.utils import canonicalize_name @@ -63,7 +63,17 @@ def _get_prepared_distribution( build_isolation: bool, check_build_deps: bool, ) -> BaseDistribution: - """Prepare a distribution for installation.""" + """Prepare a distribution for installation. + + This method will only be called by the preparer at the end of the resolve, and only + for commands which need installable artifacts (not just resolved metadata). If the + dist was previously metadata-only, the preparer must have downloaded the file + corresponding to the dist and set ``req.local_file_path``. + + This method will execute ``req.cache_concrete_dist()``, so that after invocation, + ``req.is_concrete`` will be True, because ``req.get_dist()`` will return a concrete + ``Distribution``. + """ abstract_dist = make_distribution_for_install_requirement(req) tracker_id = abstract_dist.build_tracker_id if tracker_id is not None: @@ -305,11 +315,7 @@ def _ensure_link_req_src_dir( self, req: InstallRequirement, parallel_builds: bool ) -> None: """Ensure source_dir of a linked InstallRequirement.""" - # Since source_dir is only set for editable requirements. - if req.link.is_wheel: - # We don't need to unpack wheels, so no need for a source - # directory. - return + assert not req.link.is_wheel assert req.source_dir is None if req.link.is_existing_dir(): # build local directories in-tree @@ -356,6 +362,47 @@ def _get_linked_req_hashes(self, req: InstallRequirement) -> Hashes: # showing the user what the hash should be. return req.hashes(trust_internet=False) or MissingHashes() + def _rewrite_link_and_hashes_for_cached_wheel( + self, req: InstallRequirement, hashes: Hashes + ) -> Optional[Hashes]: + """Check the hash of the requirement's source eagerly and rewrite its link. + + ``req.link`` is unconditionally rewritten to the cached wheel source so that the + requirement corresponds to where it was actually downloaded from instead of the + local cache entry. + + :returns: None if the source hash validated successfully. + """ + assert hashes + assert req.is_wheel_from_cache + assert req.download_info is not None + assert req.link.is_wheel + assert req.link.is_file + + # TODO: is it possible to avoid providing a "wrong" req.link in the first place + # in the resolver, instead of having to patch it up afterwards? + req.link = req.cached_wheel_source_link + + # We need to verify hashes, and we have found the requirement in the cache + # of locally built wheels. + if ( + isinstance(req.download_info.info, ArchiveInfo) + and req.download_info.info.hashes + and hashes.has_one_of(req.download_info.info.hashes) + ): + # At this point we know the requirement was built from a hashable source + # artifact, and we verified that the cache entry's hash of the original + # artifact matches one of the hashes we expect. We don't verify hashes + # against the cached wheel, because the wheel is not the original. + return None + + logger.warning( + "The hashes of the source archive found in cache entry " + "don't match, ignoring cached built wheel " + "and re-downloading source." + ) + return hashes + def _fetch_metadata_only( self, req: InstallRequirement, @@ -447,7 +494,7 @@ def _fetch_metadata_using_lazy_wheel( def _complete_partial_requirements( self, - partially_downloaded_reqs: Iterable[InstallRequirement], + metadata_only_reqs: Iterable[InstallRequirement], parallel_builds: bool = False, ) -> None: """Download any requirements which were only fetched by metadata.""" @@ -458,10 +505,9 @@ def _complete_partial_requirements( # Map each link to the requirement that owns it. This allows us to set # `req.local_file_path` on the appropriate requirement after passing # all the links at once into BatchDownloader. - links_to_fully_download: Dict[Link, InstallRequirement] = {} - for req in partially_downloaded_reqs: - assert req.link - links_to_fully_download[req.link] = req + links_to_fully_download: Dict[Link, InstallRequirement] = { + req.link: req for req in metadata_only_reqs + } batch_download = self._batch_download( links_to_fully_download.keys(), @@ -486,9 +532,33 @@ def _complete_partial_requirements( # This step is necessary to ensure all lazy wheels are processed # successfully by the 'download', 'wheel', and 'install' commands. - for req in partially_downloaded_reqs: + for req in metadata_only_reqs: self._prepare_linked_requirement(req, parallel_builds) + def _check_download_path(self, req: InstallRequirement) -> None: + """Check if the relevant file is already available in the download directory. + + If so, check its hash, and delete the file if the hash doesn't match.""" + if self.download_dir is None: + return + if not req.link.is_wheel: + return + + hashes = self._get_linked_req_hashes(req) + if file_path := _check_download_dir( + req.link, + self.download_dir, + hashes, + # When a locally built wheel has been found in cache, we don't warn + # about re-downloading when the already downloaded wheel hash does + # not match. This is because the hash must be checked against the + # original link, not the cached link. It that case the already + # downloaded file will be removed and re-fetched from cache (which + # implies a hash check against the cache entry's origin.json). + warn_on_hash_mismatch=not req.is_wheel_from_cache, + ): + self._downloaded[req.link.url] = file_path + def prepare_linked_requirement( self, req: InstallRequirement, parallel_builds: bool = False ) -> BaseDistribution: @@ -496,35 +566,15 @@ def prepare_linked_requirement( assert req.link self._log_preparing_link(req) with indent_log(): - # Check if the relevant file is already available - # in the download directory - file_path = None - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir( - req.link, - self.download_dir, - hashes, - # When a locally built wheel has been found in cache, we don't warn - # about re-downloading when the already downloaded wheel hash does - # not match. This is because the hash must be checked against the - # original link, not the cached link. It that case the already - # downloaded file will be removed and re-fetched from cache (which - # implies a hash check against the cache entry's origin.json). - warn_on_hash_mismatch=not req.is_wheel_from_cache, - ) + # See if the file is already downloaded, and check its hash if so. + self._check_download_path(req) - if file_path is not None: - # The file is already available, so mark it as downloaded - self._downloaded[req.link.url] = file_path - else: - # The file is not available, attempt to fetch only metadata - metadata_dist = self._fetch_metadata_only(req) - if metadata_dist is not None: - # These reqs now have the dependency information from the downloaded - # metadata, without having downloaded the actual dist at all. - req.cache_virtual_metadata_only_dist(metadata_dist) - return metadata_dist + # First try to fetch only metadata. + if metadata_dist := self._fetch_metadata_only(req): + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) + return metadata_dist # None of the optimizations worked, fully prepare the requirement return self._prepare_linked_requirement(req, parallel_builds) @@ -533,73 +583,37 @@ def prepare_linked_requirements_more( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: """Prepare linked requirements more, if needed.""" - partially_downloaded_reqs: List[InstallRequirement] = [] - for req in reqs: - if req.is_concrete: - continue - - # Determine if any of these requirements were already downloaded. - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - # If the file is there, but doesn't match the hash, delete it and print - # a warning. We will be downloading it again via - # partially_downloaded_reqs. - file_path = _check_download_dir( - req.link, self.download_dir, hashes, warn_on_hash_mismatch=True - ) - if file_path is not None: - # If the hash does match, then we still need to generate a concrete - # dist, but we don't have to download the wheel again. - self._downloaded[req.link.url] = file_path - - partially_downloaded_reqs.append(req) - + metadata_only_reqs = [req for req in reqs if not req.is_concrete] self._complete_partial_requirements( - partially_downloaded_reqs, + metadata_only_reqs, parallel_builds=parallel_builds, ) - def _prepare_linked_requirement( - self, req: InstallRequirement, parallel_builds: bool - ) -> BaseDistribution: - assert req.link - link = req.link - - hashes = self._get_linked_req_hashes(req) - - if hashes and req.is_wheel_from_cache: - assert req.download_info is not None - assert link.is_wheel - assert link.is_file - # We need to verify hashes, and we have found the requirement in the cache - # of locally built wheels. - if ( - isinstance(req.download_info.info, ArchiveInfo) - and req.download_info.info.hashes - and hashes.has_one_of(req.download_info.info.hashes) - ): - # At this point we know the requirement was built from a hashable source - # artifact, and we verified that the cache entry's hash of the original - # artifact matches one of the hashes we expect. We don't verify hashes - # against the cached wheel, because the wheel is not the original. - hashes = None - else: - logger.warning( - "The hashes of the source archive found in cache entry " - "don't match, ignoring cached built wheel " - "and re-downloading source." - ) - req.link = req.cached_wheel_source_link - link = req.link + def _ensure_local_file_path( + self, req: InstallRequirement, hashes: Optional[Hashes] + ) -> None: + """Ensure that ``req.link`` is downloaded locally, matches the expected hash, + and that ``req.local_file_path`` is set to the download location.""" + if req.link.is_existing_dir(): + return - self._ensure_link_req_src_dir(req, parallel_builds) + # NB: req.local_file_path may be set already, if it was: + # (1) sourced from a local file (such as a local .tar.gz path), + # (2) also in the wheel cache (e.g. built from an sdist). + # We will overwrite it if so, since the local file path will still point to the + # .tar.gz source instead of the wheel cache entry. - if link.is_existing_dir(): - local_file = None - elif link.url not in self._downloaded: + local_file: Optional[File] = None + # The file may have already been downloaded in batch if it was + # a metadata-only requirement, or if it was already in the download directory. + if file_path := self._downloaded.get(req.link.url, None): + if hashes: + hashes.check_against_path(file_path) + local_file = File(file_path, content_type=None) + else: try: local_file = unpack_url( - link, + req.link, req.source_dir, self._download, self.verbosity, @@ -609,39 +623,13 @@ def _prepare_linked_requirement( except NetworkConnectionError as exc: raise InstallationError( f"Could not install requirement {req} because of HTTP " - f"error {exc} for URL {link}" - ) - else: - file_path = self._downloaded[link.url] - if hashes: - hashes.check_against_path(file_path) - local_file = File(file_path, content_type=None) + f"error {exc} for URL {req.link}" + ) from exc - # If download_info is set, we got it from the wheel cache. - if req.download_info is None: - # Editables don't go through this function (see - # prepare_editable_requirement). - assert not req.editable - req.download_info = direct_url_from_link(link, req.source_dir) - # Make sure we have a hash in download_info. If we got it as part of the - # URL, it will have been verified and we can rely on it. Otherwise we - # compute it from the downloaded file. - # FIXME: https://github.com/pypa/pip/issues/11943 - if ( - isinstance(req.download_info.info, ArchiveInfo) - and not req.download_info.info.hashes - and local_file - ): - hash = hash_file(local_file.path)[0].hexdigest() - # We populate info.hash for backward compatibility. - # This will automatically populate info.hashes. - req.download_info.info.hash = f"sha256={hash}" - - # For use in later processing, - # preserve the file path on the requirement. - if local_file: + if local_file is not None: req.local_file_path = local_file.path + def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistribution: dist = _get_prepared_distribution( req, self.build_tracker, @@ -649,8 +637,60 @@ def _prepare_linked_requirement( self.build_isolation, self.check_build_deps, ) + assert dist.is_concrete + assert req.is_concrete + assert req.get_dist() is dist return dist + def _prepare_linked_requirement( + self, req: InstallRequirement, parallel_builds: bool + ) -> BaseDistribution: + """Ensure the dist pointing to the fully-resolved requirement is downloaded and + installable.""" + assert req.link, "this requirement must have a download link to fully prepare" + + hashes: Optional[Hashes] = self._get_linked_req_hashes(req) + + if hashes and req.is_wheel_from_cache: + hashes = self._rewrite_link_and_hashes_for_cached_wheel(req, hashes) + + # req.source_dir is only set for editable requirements. We don't need to unpack + # wheels, so no need for a source directory. + if not req.link.is_wheel: + self._ensure_link_req_src_dir(req, parallel_builds) + + # Ensure the dist is downloaded, check its hash, and unpack it into the source + # directory (if applicable). + self._ensure_local_file_path(req, hashes) + + # Set req.download_info for --report output. + if req.download_info is None: + # If download_info is set, we got it from the wheel cache. + self._ensure_download_info(req) + + # Build (if necessary) and prepare the distribution for installation. + return self._prepare_and_finalize_dist(req) + + def _ensure_download_info(self, req: InstallRequirement) -> None: + """Ensure that ``req.download_info`` is set, for uses such as --report.""" + assert req.download_info is None + # Editables don't go through this function (see prepare_editable_requirement). + assert not req.editable + req.download_info = direct_url_from_link(req.link, req.source_dir) + # Make sure we have a hash in download_info. If we got it as part of the + # URL, it will have been verified and we can rely on it. Otherwise we + # compute it from the downloaded file. + # FIXME: https://github.com/pypa/pip/issues/11943 + if ( + isinstance(req.download_info.info, ArchiveInfo) + and not req.download_info.info.hashes + and req.local_file_path + ): + hash = hash_file(req.local_file_path)[0].hexdigest() + # We populate info.hash for backward compatibility. + # This will automatically populate info.hashes. + req.download_info.info.hash = f"sha256={hash}" + def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None @@ -698,20 +738,9 @@ def prepare_editable_requirement( req.update_editable() assert req.source_dir req.download_info = direct_url_for_editable(req.unpacked_source_directory) - - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.finder, - self.build_isolation, - self.check_build_deps, - ) - req.check_if_exists(self.use_user_site) - # This should already have been populated by the preparation of the source dist. - assert req.is_concrete - return dist + return self._prepare_and_finalize_dist(req) def prepare_installed_requirement( self, @@ -735,13 +764,4 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." ) - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.finder, - self.build_isolation, - self.check_build_deps, - ) - - assert req.is_concrete - return dist + return self._prepare_and_finalize_dist(req)