Coverage for src/gitlabracadabra/packages/pypi.py: 79%

119 statements  

« prev     ^ index     » next       coverage.py v7.8.0, created at 2025-04-23 06:44 +0200

1# 

2# Copyright (C) 2019-2025 Mathieu Parent <math.parent@gmail.com> 

3# 

4# This program is free software: you can redistribute it and/or modify 

5# it under the terms of the GNU Lesser General Public License as published by 

6# the Free Software Foundation, either version 3 of the License, or 

7# (at your option) any later version. 

8# 

9# This program is distributed in the hope that it will be useful, 

10# but WITHOUT ANY WARRANTY; without even the implied warranty of 

11# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

12# GNU Lesser General Public License for more details. 

13# 

14# You should have received a copy of the GNU Lesser General Public License 

15# along with this program. If not, see <http://www.gnu.org/licenses/>. 

16 

17from __future__ import annotations 

18 

19from html import unescape 

20from logging import getLogger 

21from posixpath import join as posixpath_join 

22from typing import TYPE_CHECKING, Any 

23from urllib.parse import quote as urlquote 

24from urllib.parse import urljoin, urlparse, urlunparse 

25from urllib.request import parse_keqv_list 

26 

27from html5lib import parse as html5lib_parse 

28from packaging.requirements import InvalidRequirement, Requirement 

29from packaging.utils import canonicalize_name 

30from packaging.version import InvalidVersion, Version 

31from requests import codes 

32 

33from gitlabracadabra.packages.package_file import PackageFile 

34from gitlabracadabra.packages.pip import extract_version_from_fragment 

35from gitlabracadabra.packages.source import Source 

36 

37if TYPE_CHECKING: 37 ↛ 38line 37 didn't jump to line 38 because the condition on line 37 was never true

38 from requests.models import Response 

39 

40 from gitlabracadabra.packages.destination import Destination 

41 

42try: 

43 from packaging.utils import parse_wheel_filename 

44 

45 HAS_PACKAGING_PARSERS = True 

46except ImportError: # packaging << 20.9 

47 HAS_PACKAGING_PARSERS = False 

48 

49logger = getLogger(__name__) 

50 

51 

class PyPI(Source):
    """PyPI repository.

    For each configured requirement, the project page of an HTML "simple"
    index is fetched and the files of the highest version satisfying the
    requirement are returned.
    """

    def __init__(
        self,
        *,
        log_prefix: str = "",
        index_url: str | None = None,
        requirements: str | list[str],
    ) -> None:
        """Initialize a PyPI repository object.

        Args:
            log_prefix: Log prefix.
            index_url: index-url (default to https://pypi.org/simple).
            requirements: Python requirements as list or string.
        """
        super().__init__()
        self._log_prefix = log_prefix
        self._index_url = index_url or "https://pypi.org/simple"
        # Normalize to one requirement per list item, whatever the input shape.
        if isinstance(requirements, str):
            self._requirements = requirements.splitlines()
        else:
            self._requirements = [req for reqs in requirements for req in reqs.splitlines()]

    def __str__(self) -> str:
        """Return string representation.

        Returns:
            A string.
        """
        return "PyPI repository"

    def package_files(
        self,
        destination: Destination,  # noqa: ARG002
    ) -> list[PackageFile]:
        """Return list of package files.

        Blank lines and comment lines (starting with ``#``) are skipped.
        A warning is logged for each requirement that yields no file.

        Args:
            destination: Destination (unused).

        Returns:
            List of package files.
        """
        package_files: list[PackageFile] = []
        if not HAS_PACKAGING_PARSERS:
            logger.error(
                "%sPyPI packages mirroring requires packaging >= 20.9",
                self._log_prefix,
            )
            return package_files
        for requirement_string in self._requirements:
            stripped = requirement_string.strip()
            # Blank lines carry no requirement: skip them like comments instead
            # of reporting them as invalid requirements.
            if not stripped or stripped.startswith("#"):
                continue
            package_files_from_requirement_string = self._package_files_from_requirement_string(requirement_string)
            if not package_files_from_requirement_string:
                logger.warning(
                    "%sNo package files matching found for requirement: %s",
                    self._log_prefix,
                    requirement_string,
                )
            package_files.extend(package_files_from_requirement_string)
        return package_files

    def _package_files_from_requirement_string(self, requirement_string: str) -> list[PackageFile]:
        """Parse a requirement string and return its matching package files.

        Args:
            requirement_string: A single requirement line.

        Returns:
            Matching package files, or an empty list (with a warning logged)
            when the string is not a valid requirement.
        """
        try:
            req = Requirement(requirement_string)
        except InvalidRequirement:
            logger.warning(
                '%sInvalid requirement "%s"',
                self._log_prefix,
                requirement_string,
            )
            return []
        return self._package_files_from_requirement(req)

    def _package_files_from_requirement(self, req: Requirement) -> list[PackageFile]:
        """Fetch the project index page and return matching package files.

        Args:
            req: Parsed requirement.

        Returns:
            Matching package files, or an empty list (with a warning logged)
            when the index does not answer with HTTP 200.
        """
        index_url = self._get_index_url(req.name)
        index_response = self.session.request("get", index_url)
        if index_response.status_code != codes["ok"]:
            logger.warning(
                "%sUnexpected HTTP status for PyPI index %s: received %i %s",
                self._log_prefix,
                index_url,
                index_response.status_code,
                index_response.reason,
            )
            return []
        return self._package_files_from_requirement_and_response(req, index_response)

    def _get_index_url(self, project_name: str) -> str:
        """Return the index page URL of a project, always ending with a slash.

        Args:
            project_name: Project name (canonicalized and URL-quoted here).

        Returns:
            URL of the project page.
        """
        loc = posixpath_join(
            self._index_url,
            urlquote(canonicalize_name(project_name)),
        )
        if not loc.endswith("/"):
            loc = f"{loc}/"
        return loc

    def _package_files_from_requirement_and_response(
        self,
        req: Requirement,
        response: Response,
    ) -> list[PackageFile]:
        """Return the files of the highest matching version listed in a page.

        Args:
            req: Parsed requirement.
            response: HTTP response holding the HTML project page.

        Returns:
            Package files of the highest version matching the requirement,
            or an empty list when no anchor matches.
        """
        document = html5lib_parse(
            response.content,
            transport_encoding=response.encoding,
            namespaceHTMLElements=False,
        )

        base_url = self._get_base_url(response, document)

        # Group candidate files by version so only the best one is kept.
        package_files: dict[Version, list[PackageFile]] = {}
        for anchor in document.findall(".//a"):
            version, package_file = self._package_file_from_requirement_and_anchor(req, anchor, base_url)
            if version and package_file:
                package_files.setdefault(version, []).append(package_file)

        if not package_files:
            return []
        return package_files[max(package_files)]

    def _get_base_url(self, response: Response, document: Any) -> str:
        """Return the URL against which anchor hrefs are resolved.

        Args:
            response: HTTP response of the project page.
            document: Parsed HTML document.

        Returns:
            The href of the first ``<base>`` element having one, otherwise
            the response URL.
        """
        base_url = response.url
        for base in document.findall(".//base"):
            href = base.get("href")
            if href is not None:
                base_url = href
                break
        return base_url

    def _package_file_from_requirement_and_anchor(
        self,
        req: Requirement,
        anchor: Any,
        base_url: str,
    ) -> tuple[Version | None, PackageFile | None]:
        """Build a package file from an index anchor if it matches a requirement.

        Args:
            req: Parsed requirement.
            anchor: ``<a>`` element of the project page.
            base_url: URL used to resolve a relative href.

        Returns:
            ``(version, package_file)``, or ``(None, None)`` when the anchor
            has no href, is yanked (and the requirement does not start with
            ``==``), has an unparsable or unsupported filename, or does not
            satisfy the requirement.
        """
        href = anchor.get("href")
        if href is None:
            return None, None
        # PEP 592: the mere presence of data-yanked marks the file as yanked;
        # the attribute may carry no value (parsed as an empty string).
        if anchor.get("data-yanked") is not None and not str(req.specifier).startswith("=="):
            return None, None

        parsed_url = urlparse(urljoin(base_url, href))

        filename = parsed_url.path.split("/")[-1]
        try:
            name, ver = self._parse_filename(filename, canonicalize_name(req.name))
        except InvalidVersion:
            # Ignore invalid versions, like in pbr-0.5.2.5.g5b3e942.tar.gz
            logger.debug(
                "%sIgnoring invalid version for filename %s",
                self._log_prefix,
                filename,
            )
            return None, None

        if name is None or ver is None or ver not in req.specifier:
            return None, None

        # parse_keqv_list() raises on items lacking a non-empty "key=value"
        # shape, which happens as soon as the link has no hash fragment.
        fragment_items = [item for item in parsed_url.fragment.split("&") if item.partition("=")[2]]
        metadata = parse_keqv_list(fragment_items)

        requires_python = anchor.get("data-requires-python")
        if requires_python is not None:
            metadata["requires-python"] = unescape(requires_python)

        return ver, PackageFile(
            urlunparse(parsed_url._replace(fragment="")),
            "pypi",
            name,
            str(ver),
            filename,
            metadata=metadata,
        )

    def _parse_filename(self, filename: str, canonical_name: str) -> tuple[str | None, Version | None]:
        """Extract the project name and version from a distribution filename.

        Args:
            filename: Last path segment of the file URL.
            canonical_name: Canonicalized project name of the requirement.

        Returns:
            ``(name, version)`` for wheels and ``.tar.gz`` source archives,
            ``(None, None)`` for eggs, malformed wheel names and any other
            file type.

        Raises:
            InvalidVersion: When the version part cannot be parsed.
        """
        if filename.endswith(".whl"):
            # Imported locally: the name ships with parse_wheel_filename
            # (packaging >= 20.9), which callers have already checked for.
            from packaging.utils import InvalidWheelFilename  # noqa: PLC0415

            try:
                name, ver, _, _ = parse_wheel_filename(filename)
            except InvalidWheelFilename:
                # One malformed wheel name must not abort the whole mirroring.
                logger.debug(
                    "%sIgnoring invalid wheel filename %s",
                    self._log_prefix,
                    filename,
                )
                return None, None
            return name, ver
        if filename.endswith(".egg"):
            # Ignore egg files for now
            return None, None
        if filename.endswith(".tar.gz"):
            ver_str = extract_version_from_fragment(filename[:-7], canonical_name)
            if ver_str:
                return canonical_name, Version(ver_str)
        return None, None