Skip to content

Commit 2f85457

Browse files
Add or update from a specific commit; LFS fixes (#805)
* feat: add or update dataset files to a specific git commit * track dataset files in LFS * Skip LFS smudge filter * docs: dataset add --ref
1 parent 16d03b6 commit 2f85457

File tree

4 files changed

+182
-16
lines changed

4 files changed

+182
-16
lines changed

renku/cli/dataset.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,6 +111,10 @@
111111
new-dir/
112112
new-filename
113113
114+
To add a specific version of files, use the ``--ref`` option for selecting a
115+
branch, commit, or tag. The value passed to this option must be a valid
116+
reference in the remote Git repository.
117+
114118
Tagging a dataset:
115119
116120
A dataset can be tagged with an arbitrary tag to refer to the dataset at that
@@ -404,7 +408,10 @@ def edit(dataset_id):
404408
default='',
405409
help='Destination file or directory within the dataset path'
406410
)
407-
def add(name, urls, link, force, create, sources, destination):
411+
@click.option(
412+
'--ref', default=None, help='Add files from a specific commit/tag/branch.'
413+
)
414+
def add(name, urls, link, force, create, sources, destination, ref):
408415
"""Add data to a dataset."""
409416
progress = partial(progressbar, label='Adding data to dataset')
410417
add_file(
@@ -415,6 +422,7 @@ def add(name, urls, link, force, create, sources, destination):
415422
create=create,
416423
sources=sources,
417424
destination=destination,
425+
ref=ref,
418426
urlscontext=progress
419427
)
420428

@@ -638,8 +646,11 @@ def _init(lock, id_queue):
638646
multiple=True,
639647
help='Exclude files matching given pattern.'
640648
)
641-
def update(names, creators, include, exclude):
649+
@click.option(
650+
'--ref', default=None, help='Update to a specific commit/tag/branch.'
651+
)
652+
def update(names, creators, include, exclude, ref):
642653
"""Updates files in dataset from a remote Git repo."""
643654
progress_context = partial(progressbar, label='Updating files')
644-
update_datasets(names, creators, include, exclude, progress_context)
655+
update_datasets(names, creators, include, exclude, ref, progress_context)
645656
click.secho('OK', fg='green')

renku/core/commands/dataset.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,13 @@ def add_file(
141141
create=False,
142142
sources=(),
143143
destination='',
144+
ref=None,
144145
with_metadata=None,
145146
urlscontext=contextlib.nullcontext
146147
):
147148
"""Add data file to a dataset."""
148149
add_to_dataset(
149-
client, urls, name, link, force, create, sources, destination,
150+
client, urls, name, link, force, create, sources, destination, ref,
150151
with_metadata, urlscontext
151152
)
152153

@@ -160,6 +161,7 @@ def add_to_dataset(
160161
create=False,
161162
sources=(),
162163
destination='',
164+
ref=None,
163165
with_metadata=None,
164166
urlscontext=contextlib.nullcontext
165167
):
@@ -180,6 +182,7 @@ def add_to_dataset(
180182
force=force,
181183
sources=sources,
182184
destination=destination,
185+
ref=ref
183186
)
184187

185188
if with_metadata:
@@ -511,6 +514,7 @@ def update_datasets(
511514
creators,
512515
include,
513516
exclude,
517+
ref,
514518
progress_context=contextlib.nullcontext
515519
):
516520
"""Update files from a remote Git repo."""
@@ -527,6 +531,7 @@ def update_datasets(
527531

528532
datasets = {}
529533
possible_updates = []
534+
unique_remotes = set()
530535

531536
for file_ in records:
532537
if file_.based_on:
@@ -539,11 +544,19 @@ def update_datasets(
539544

540545
file_.dataset = dataset
541546
possible_updates.append(file_)
547+
unique_remotes.add(file_.based_on['url'])
548+
549+
if ref and len(unique_remotes) > 1:
550+
raise ParameterError(
551+
'Cannot use "--ref" with more than one Git repository.\n'
552+
'Limit list of files to be updated to one repository. See '
553+
'"renku dataset update -h" for more information.'
554+
)
542555

543556
with progress_context(
544557
possible_updates, item_show_func=lambda x: x.path if x else None
545558
) as progressbar:
546-
client.update_dataset_files(progressbar)
559+
client.update_dataset_files(progressbar, ref)
547560

548561

549562
def _include_exclude(file_path, include=None, exclude=None):

renku/core/management/datasets.py

Lines changed: 47 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ def add_data_to_dataset(
179179
force=False,
180180
sources=(),
181181
destination='',
182+
ref=None,
182183
link=False
183184
):
184185
"""Import the data into the data directory."""
@@ -193,7 +194,7 @@ def add_data_to_dataset(
193194
sources = sources or ()
194195
files.extend(
195196
self._add_from_git(
196-
dataset, dataset_path, url, sources, destination
197+
dataset, dataset_path, url, sources, destination, ref
197198
)
198199
)
199200
else:
@@ -313,7 +314,9 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
313314
'parent': self
314315
}]
315316

316-
def _add_from_git(self, dataset, dataset_path, url, sources, destination):
317+
def _add_from_git(
318+
self, dataset, dataset_path, url, sources, destination, ref
319+
):
317320
"""Process adding resources from another git repository."""
318321
from renku import LocalClient
319322

@@ -357,7 +360,7 @@ def _add_from_git(self, dataset, dataset_path, url, sources, destination):
357360
'Scheme {} not supported'.format(u.scheme)
358361
)
359362

360-
repo, repo_path = self._prepare_git_repo(url)
363+
repo, repo_path = self._prepare_git_repo(url, ref)
361364

362365
dataset_path = self.path / dataset_path
363366

@@ -419,15 +422,19 @@ def _add_from_git(self, dataset, dataset_path, url, sources, destination):
419422
client, path=path, url=url
420423
)
421424

425+
path_in_dst_repo = dst.relative_to(self.path)
426+
422427
results.append({
423-
'path': dst.relative_to(self.path),
428+
'path': path_in_dst_repo,
424429
'url': dst_url,
425430
'creator': creators,
426431
'dataset': dataset.name,
427432
'parent': self,
428433
'based_on': based_on
429434
})
430435

436+
self.track_paths_in_storage(str(path_in_dst_repo))
437+
431438
dst.parent.mkdir(parents=True, exist_ok=True)
432439
shutil.copy(str(src), str(dst))
433440

@@ -611,7 +618,7 @@ def remove_dataset_tags(self, dataset, tags):
611618

612619
return dataset
613620

614-
def update_dataset_files(self, files):
621+
def update_dataset_files(self, files, ref=None):
615622
"""Update files and dataset metadata according to their remotes."""
616623
from renku import LocalClient
617624

@@ -625,7 +632,7 @@ def update_dataset_files(self, files):
625632
if url in visited_repos:
626633
repo, repo_path, remote_client = visited_repos[url]
627634
else:
628-
repo, repo_path = self._prepare_git_repo(url)
635+
repo, repo_path = self._prepare_git_repo(url, ref)
629636
remote_client = LocalClient(repo_path)
630637
visited_repos[url] = repo, repo_path, remote_client
631638

@@ -703,7 +710,19 @@ def update_dataset_files(self, files):
703710
for dataset in modified_datasets.values():
704711
dataset.to_yaml()
705712

706-
def _prepare_git_repo(self, url):
713+
def _prepare_git_repo(self, url, ref):
714+
def checkout(repo, ref):
715+
try:
716+
repo.git.checkout(ref)
717+
except GitCommandError:
718+
raise errors.ParameterError(
719+
'Cannot find reference "{}" in Git repository: {}'.format(
720+
ref, url
721+
)
722+
)
723+
724+
RENKU_BRANCH = 'renku-default-branch'
725+
ref = ref or RENKU_BRANCH
707726
u = GitURL.parse(url)
708727
path = u.pathname
709728
if u.hostname == 'localhost':
@@ -715,11 +734,11 @@ def _prepare_git_repo(self, url):
715734

716735
if repo_path.exists():
717736
repo = Repo(str(repo_path))
718-
origin = repo.remotes.origin
719-
if origin.url == url:
737+
if repo.remotes.origin.url == url:
720738
try:
721-
origin.fetch()
722-
origin.pull()
739+
repo.git.fetch()
740+
repo.git.checkout(ref)
741+
repo.git.pull()
723742
except GitError:
724743
# ignore the error and try re-cloning
725744
pass
@@ -735,7 +754,24 @@ def _prepare_git_repo(self, url):
735754
)
736755

737756
try:
757+
os.environ['GIT_LFS_SKIP_SMUDGE'] = '1'
738758
repo = Repo.clone_from(url, str(repo_path), recursive=True)
759+
# Because the name of the default branch is not always 'master', we
760+
# create an alias of the default branch when cloning the repo. It
761+
# is used to refer to the default branch later.
762+
renku_ref = 'refs/heads/' + RENKU_BRANCH
763+
repo.git.execute([
764+
'git', 'symbolic-ref', renku_ref, repo.head.reference.path
765+
])
766+
checkout(repo, ref)
767+
# Disable Git LFS smudge filter
768+
repo.git.execute(
769+
command=[
770+
'git', 'lfs', 'install', '--local', '--skip-smudge',
771+
'--force'
772+
],
773+
with_exceptions=False
774+
)
739775
except GitCommandError:
740776
raise errors.GitError(
741777
'Cannot access remote Git repo: {}'.format(url)

tests/cli/test_integration_datasets.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
# limitations under the License.
1818
"""Integration tests for dataset command."""
1919
import os
20+
import subprocess
2021

2122
import git
2223
import pytest
@@ -624,3 +625,108 @@ def test_datasets_import_target(
624625
],
625626
)
626627
assert 0 == result.exit_code
628+
629+
630+
@pytest.mark.integration
631+
@pytest.mark.parametrize(
632+
'ref', ['v0.3.0', 'fe6ec65cc84bcf01e879ef38c0793208f7fab4bb']
633+
)
634+
def test_add_specific_refs(ref, runner, client):
635+
"""Test adding a specific version of files."""
636+
FILENAME = 'CHANGES.rst'
637+
# create a dataset
638+
result = runner.invoke(cli, ['dataset', 'create', 'dataset'])
639+
assert 0 == result.exit_code
640+
641+
# add data from a git repo
642+
result = runner.invoke(
643+
cli, [
644+
'dataset', 'add', 'dataset', '-s', FILENAME, '--ref', ref,
645+
'https://github.com/SwissDataScienceCenter/renku-python.git'
646+
]
647+
)
648+
assert 0 == result.exit_code
649+
content = (client.path / 'data' / 'dataset' / FILENAME).read_text()
650+
assert 'v0.3.0' in content
651+
assert 'v0.3.1' not in content
652+
653+
654+
@pytest.mark.integration
655+
@pytest.mark.parametrize(
656+
'ref', ['v0.3.1', '27e29abd409c83129a3fdb8b8b0b898b23bcb229']
657+
)
658+
def test_update_specific_refs(ref, runner, client):
659+
"""Test updating to a specific version of files."""
660+
FILENAME = 'CHANGES.rst'
661+
# create a dataset
662+
result = runner.invoke(cli, ['dataset', 'create', 'dataset'])
663+
assert 0 == result.exit_code
664+
665+
# add data from a git repo
666+
result = runner.invoke(
667+
cli, [
668+
'dataset', 'add', 'dataset', '-s', FILENAME, '--ref', 'v0.3.0',
669+
'https://github.com/SwissDataScienceCenter/renku-python.git'
670+
]
671+
)
672+
assert 0 == result.exit_code
673+
content = (client.path / 'data' / 'dataset' / FILENAME).read_text()
674+
assert 'v0.3.1' not in content
675+
676+
# update data to a later version
677+
result = runner.invoke(cli, ['dataset', 'update', '--ref', ref])
678+
assert 0 == result.exit_code
679+
content = (client.path / 'data' / 'dataset' / FILENAME).read_text()
680+
assert 'v0.3.1' in content
681+
assert 'v0.3.2' not in content
682+
683+
684+
@pytest.mark.integration
685+
def test_update_with_multiple_remotes_and_ref(runner, client):
686+
"""Test updating fails when ref is ambiguous."""
687+
# create a dataset
688+
result = runner.invoke(cli, ['dataset', 'create', 'dataset'])
689+
assert 0 == result.exit_code
690+
691+
# add data from a git repo
692+
result = runner.invoke(
693+
cli, [
694+
'dataset', 'add', 'dataset', '-s', 'CHANGES.rst',
695+
'https://github.com/SwissDataScienceCenter/renku-python.git'
696+
]
697+
)
698+
assert 0 == result.exit_code
699+
700+
# add data from another git repo
701+
result = runner.invoke(
702+
cli, [
703+
'dataset', 'add', 'dataset', '-s', 'LICENSE',
704+
'https://github.com/SwissDataScienceCenter/renku-notebooks.git'
705+
]
706+
)
707+
assert 0 == result.exit_code
708+
709+
# update data to a later version
710+
result = runner.invoke(cli, ['dataset', 'update', '--ref', 'any-value'])
711+
assert 2 == result.exit_code
712+
assert 'Cannot use "--ref" with more than one Git repo' in result.output
713+
714+
715+
@pytest.mark.integration
716+
def test_files_are_tracked_in_lfs(runner, client):
717+
"""Test files added from a Git repo are tracked in Git LFS."""
718+
FILENAME = 'CHANGES.rst'
719+
# create a dataset
720+
result = runner.invoke(cli, ['dataset', 'create', 'dataset'])
721+
assert 0 == result.exit_code
722+
723+
# add data from a git repo
724+
result = runner.invoke(
725+
cli, [
726+
'dataset', 'add', 'dataset', '-s', FILENAME,
727+
'https://github.com/SwissDataScienceCenter/renku-python.git'
728+
]
729+
)
730+
assert 0 == result.exit_code
731+
path = 'data/dataset/{}'.format(FILENAME)
732+
assert path in subprocess.check_output(['git', 'lfs', 'ls-files']).decode()

0 commit comments

Comments
 (0)