Source code for renku.core.models.entity

# -*- coding: utf-8 -*-
#
# Copyright 2018-2021- Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Represent provenance entities."""

import os.path
from pathlib import Path
from typing import List, Union
from urllib.parse import quote

from renku.core.metadata.immutable import Immutable
from renku.core.models.calamus import JsonLDSchema, Nested, fields, prov, renku
from renku.core.utils.git import get_object_hash

# Module-level, process-wide cache of entities built by ``Entity.from_revision``.
# Lookups there use the tuple ``(revision, str(path))`` as key. Nothing in the
# visible code evicts entries, so the cache lives as long as the module does.
_entity_cache = {}


class Entity(Immutable):
    """Represent a file.

    Equality and hashing are based on ``checksum`` and ``path`` only; ``id`` does not take part in comparisons.
    """

    __slots__ = ("checksum", "path")

    def __init__(self, *, checksum: str, id: str = None, path: Union[Path, str], **kwargs):
        """Create an entity.

        :param checksum: checksum stored on the entity and used to derive the default ``id``
        :param id: explicit identifier; generated from ``checksum`` and ``path`` when omitted or empty
        :param path: path of the entity; must not be absolute. It is stored as a string.
        :param kwargs: extra attributes forwarded unchanged to the base class initializer
        """
        path = str(path)
        assert id is None or isinstance(id, str)
        assert not os.path.isabs(path), f"Entity is being created with absolute path: '{path}'"

        id = id or Entity.generate_id(checksum, path)
        super().__init__(checksum=checksum, id=id, path=path, **kwargs)

    def __eq__(self, other):
        """Return True when ``other`` is an Entity with the same checksum and path."""
        if self is other:
            return True
        if not isinstance(other, Entity):
            return False
        return self.checksum == other.checksum and self.path == other.path

    def __hash__(self):
        """Hash on the same fields that ``__eq__`` compares."""
        return hash((self.checksum, self.path))

    @staticmethod
    def generate_id(checksum: str, path: Union[Path, str]) -> str:
        """Generate an Entity identifier.

        Leading and trailing slashes are stripped from ``path`` and the remainder is URL-quoted, giving
        ``/entities/<checksum>/<quoted path>``.
        """
        quoted_path = quote(str(path).strip("/"))

        return f"/entities/{checksum}/{quoted_path}"

    @classmethod
    def from_revision(cls, client, path: Union[Path, str], revision: str = None) -> "Entity":
        """Return dependency from given path and revision.

        :param client: object providing ``repo`` (passed to ``get_object_hash``) and ``path`` (the base directory
            that ``path`` is joined to)
        :param path: path of the file or directory, joined to ``client.path``
        :param revision: revision to resolve the checksum at; defaults to ``"HEAD"``
        :return: a ``Collection`` when the path is an existing directory other than ``"."``; otherwise an instance
            of ``cls``
        """
        revision = revision or "HEAD"
        assert isinstance(revision, str), f"Invalid revision: {revision}"

        # FIXME: Implement the cache properly: It's mainly used in migrations
        key = (revision, str(path))
        cached_entry = _entity_cache.get(key)
        if cached_entry is not None:
            return cached_entry

        # TODO: What checksum we get at "HEAD" if object is staged but not committed
        checksum = get_object_hash(repo=client.repo, revision=revision, path=path)
        # NOTE: If object was not found at a revision it's either removed or exists in a different revision; keep the
        # entity and use revision as checksum
        checksum = checksum or revision
        # TODO: What would be checksum for a directory if it's not committed yet.
        id = cls.generate_id(checksum=checksum, path=path)

        absolute_path = client.path / path
        if str(path) != "." and absolute_path.is_dir():
            members = cls.get_directory_members(client, revision, absolute_path)
            entity = Collection(id=id, checksum=checksum, path=path, members=members)
        else:
            entity = cls(id=id, checksum=checksum, path=path)

        # Store under the same key that the lookup above uses; otherwise the cache can never produce a hit.
        _entity_cache[key] = entity

        return entity

    @classmethod
    def get_directory_members(cls, client, revision, absolute_path: Path) -> List["Entity"]:
        """Return first-level files/directories in a directory.

        Entries named ``.gitkeep`` are skipped. Each remaining entry is resolved with ``from_revision`` using its
        path relative to ``client.path``; entries for which that call raises ``KeyError`` are left out.
        """
        members: List[Entity] = []

        for member in absolute_path.iterdir():
            if member.name == ".gitkeep":
                continue

            member_path = str(member.relative_to(client.path))
            assert all(member_path != m.path for m in members)

            try:
                entity = cls.from_revision(client, member_path, revision)
            except KeyError:
                # NOTE: If a path is not found at a revision we assume that it didn't exist at that revision
                continue

            if entity:
                members.append(entity)

        return members
class Collection(Entity):
    """Represent a directory with files.

    The contained entities are kept in ``members`` as a tuple.
    """

    __slots__ = ("members",)

    def __init__(self, *, checksum: str, id: str = None, path: Union[Path, str], members: List[Entity] = None):
        """Create a collection; a missing or empty ``members`` is stored as an empty tuple."""
        frozen_members = tuple(members or ())
        super().__init__(checksum=checksum, id=id, path=path, members=frozen_members)
class EntitySchema(JsonLDSchema):
    """JSON-LD schema describing how an ``Entity`` is (de)serialized."""

    class Meta:
        """Bind the schema to ``prov.Entity`` and the ``Entity`` model."""

        rdf_type = [prov.Entity]
        model = Entity

    checksum = fields.String(renku.checksum, missing=None)
    id = fields.Id()
    path = fields.String(prov.atLocation)


class CollectionSchema(EntitySchema):
    """JSON-LD schema for ``Collection``: the ``EntitySchema`` fields plus nested members."""

    class Meta:
        """Bind the schema to ``prov.Collection`` and the ``Collection`` model."""

        rdf_type = prov.Collection
        model = Collection

    # Members may be plain entities or nested collections; "CollectionSchema" is given by name because the class
    # is still being defined at this point.
    members = Nested(prov.hadMember, [EntitySchema, "CollectionSchema"], many=True)