Source code for file.collection

from datetime import timedelta
from depot.io.utils import file_from_content
from onegov.file.models import File, FileSet
from onegov.file.utils import as_fileintent, digest
from sedate import utcnow
from sqlalchemy import and_, text, or_


from typing import overload, Any, Generic, IO, Literal, TypeVar, TYPE_CHECKING
if TYPE_CHECKING:
    from datetime import datetime
    from onegov.file.types import SignatureMetadata
    from sqlalchemy.orm import Query, Session


FileT = TypeVar('FileT', bound=File)
FileSetT = TypeVar('FileSetT', bound=FileSet)
class FileCollection(Generic[FileT]):
    """ Manages files.

    :param session:
        The SQLAlchemy db session to use.

    :param type:
        The polymorphic type to use and to filter for, or '*' for all.

    :param allow_duplicates:
        If set to False, prevents duplicates from being added. Duplicates
        are detected before pre-processing, so already stored files may be
        downloaded and added again, as they might have changed during the
        upload.

        Note that this does not change existing files. It only prevents
        new duplicates from being added.

    """

    @overload
    def __init__(
        self: 'FileCollection[File]',
        session: 'Session',
        type: Literal['*', 'generic'] = '*',
        allow_duplicates: bool = True
    ) -> None: ...

    @overload
    def __init__(
        self,
        session: 'Session',
        type: str,
        allow_duplicates: bool = True
    ) -> None: ...

    def __init__(
        self,
        session: 'Session',
        type: str = '*',
        allow_duplicates: bool = True
    ) -> None:
        self.session = session
        self.type = type
        self.allow_duplicates = allow_duplicates
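    # A minimal instantiation sketch; `session` stands in for an active
    # SQLAlchemy session, and the 'image' subtype is only an assumption
    # about which polymorphic types a given application registers:
    #
    #     files = FileCollection(session)                    # all files
    #     images = FileCollection(session, type='image')     # one type only
    #     unique = FileCollection(session, allow_duplicates=False)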
    def query(self) -> 'Query[FileT]':
        if self.type != '*':
            model_class = File.get_polymorphic_class(self.type, File)
            # FIXME: this is a weird singularity, which happens to not cause
            #        any issues since our inheritance structure never inherits
            #        from a subclass of File. We should be consistent about
            #        what filtering by type means: does it mean exactly that
            #        type, or does it also allow subclasses?
            if model_class is File:
                return self.session.query(  # type:ignore[return-value]
                    File).filter_by(type=self.type)
            return self.session.query(model_class)
        return self.session.query(File)
    def add(
        self,
        filename: str,
        content: bytes | IO[bytes],
        note: str | None = None,
        published: bool = True,
        publish_date: 'datetime | None' = None,
        publish_end_date: 'datetime | None' = None
    ) -> FileT:
        """ Adds a file with the given filename. The content may be either
        bytes or a file object.

        """

        if not self.allow_duplicates:
            self.assert_not_duplicate(content)

        type_ = self.type if self.type != '*' else 'generic'

        file: FileT
        file = File.get_polymorphic_class(type_, File)()  # type:ignore

        file.name = filename
        file.note = note
        file.type = type_
        file.published = published
        file.publish_date = publish_date
        file.publish_end_date = publish_end_date
        file.reference = as_fileintent(content, filename)

        self.session.add(file)
        self.session.flush()

        return file
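    # Sketch of `add`, assuming `files` is a FileCollection bound to an
    # open session (the names and content are illustrative):
    #
    #     pdf = files.add(
    #         'report.pdf', b'%PDF-1.4 ...', note='Q3 report',
    #         published=False, publish_date=utcnow() + timedelta(days=1))
    #
    # The session is flushed before returning, so attributes like `pdf.id`
    # should already be available on the returned object.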
    def replace(self, file: File, content: bytes | IO[bytes]) -> None:
        """ Replaces the content of the given file with the new content. """

        if not self.allow_duplicates:
            self.assert_not_duplicate(content)

        file.reference = as_fileintent(content, file.name)
        self.session.flush()
    def assert_not_duplicate(self, content: bytes | IO[bytes]) -> None:
        existing = self.by_content(content).first()

        if existing:
            raise FileExistsError(existing)
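    # With allow_duplicates=False, adding identical content raises a
    # FileExistsError carrying the existing record; a hypothetical caller
    # could recover it like this:
    #
    #     try:
    #         files.add('copy.pdf', same_bytes)
    #     except FileExistsError as e:
    #         existing = e.args[0]  # the File found by assert_not_duplicate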
    def delete(self, file: File) -> None:
        self.session.delete(file)
        self.session.flush()
    def no_longer_published_files(
        self,
        horizon: 'datetime'
    ) -> 'Query[FileT]':
        """ Returns a query of files where the publishing end date has
        expired.

        """
        return self.query().filter(and_(
            File.published.is_(True),
            File.publish_end_date < horizon
        ))
    def publishable_files(self, horizon: 'datetime') -> 'Query[FileT]':
        """ Returns a query of files which may be published. """
        return self.query().filter(and_(
            File.published.is_(False),
            File.publish_date <= horizon,
            or_(
                File.publish_end_date.is_(None),
                File.publish_end_date > horizon
            )
        ))
    def publish_files(self, horizon: 'datetime | None' = None) -> None:
        """ Publishes unpublished files with a publish date older than the
        given horizon, and unpublishes files whose publishing end date has
        passed.

        """
        # default to a horizon slightly into the future, as this method is
        # usually called by a cronjob which is not perfectly on time
        horizon = horizon or (utcnow() + timedelta(seconds=90))

        for fi in self.no_longer_published_files(horizon):
            fi.published = False
            # technically this should already be None, but we still set
            # it to make absolutely sure we don't republish it right
            # after, because it still had a publish date in the past
            fi.publish_date = None
            fi.publish_end_date = None

        self.session.flush()

        for f in self.publishable_files(horizon):
            f.published = True
            f.publish_date = None

        self.session.flush()
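    # Typical cronjob-style invocation (sketch): calling without an explicit
    # horizon applies the 90-second grace period defined above.
    #
    #     FileCollection(session).publish_files()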
    def by_id(self, file_id: str) -> FileT | None:
        """ Returns the file with the given id or None. """
        return self.query().filter(File.id == file_id).first()
    def by_filename(self, filename: str) -> 'Query[FileT]':
        """ Returns a query that matches the files with the given filename.

        Be aware that there may be multiple files with the same filename!

        """
        return self.query().filter(File.name == filename)
    def by_checksum(self, checksum: str) -> 'Query[FileT]':
        """ Returns a query that matches the given checksum (may be more
        than one record).

        """
        return self.query().filter(File.checksum == checksum)
    def by_content(self, content: bytes | IO[bytes]) -> 'Query[FileT]':
        """ Returns a query that matches the given content (may be more
        than one record).

        """
        close, file = file_from_content(content)

        # we need to look up two checksums: the one of the stored file and
        # possibly the one it had before signing
        #
        # XXX maybe it makes sense to combine those into a data structure
        # that holds all kinds of digests a file is known under

        # checksum
        md5 = digest(file, 'md5')

        # old_digest of signature_metadata
        sha = digest(file, 'sha256')

        if close:
            file.close()

        return self.query().filter(or_(
            File.checksum == md5,
            File.signature_metadata['old_digest'].astext == sha
        ))
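    # Content lookup sketch: both raw bytes and file objects are accepted,
    # since file_from_content normalizes the input before hashing:
    #
    #     duplicate = files.by_content(b'...').first()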
    def by_content_type(self, content_type: str) -> 'Query[FileT]':
        """ Returns a query that matches the given MIME content type (may
        be more than one record).

        """
        return self.query().filter(
            text("reference->>'content_type' = :content_type").bindparams(
                content_type=content_type
            )
        )
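    # MIME-type lookup sketch (illustrative):
    #
    #     pdfs = files.by_content_type('application/pdf').all()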
    def by_signature_digest(self, digest: str) -> 'Query[FileT]':
        """ Returns a query that matches the given digest in the signature
        metadata. In other words, given a digest this function will find
        signed files that match the digest - either before or after signing.

        Unsigned files are ignored.

        The digest is expected to be a SHA256 hex.

        """
        return self.query().filter_by(signed=True).filter(
            or_(
                text(
                    "signature_metadata->>'old_digest' = :digest"
                ).bindparams(digest=digest),
                text(
                    "signature_metadata->>'new_digest' = :digest"
                ).bindparams(digest=digest)
            )
        )
    def locate_signature_metadata(
        self,
        digest: str
    ) -> 'SignatureMetadata | None':
        """ Looks for the given digest in the files table - if that doesn't
        work, it will go through the audit trail (i.e. the chat messages)
        and see if the digest can be found there.

        If this database was ever used to sign a file with the given digest,
        or if a file that was signed had the given digest, this function
        will find it - barring manual database manipulation in the messages
        log.

        """
        match = self.by_signature_digest(digest).with_entities(
            File.signature_metadata).first()

        if match:
            return match.signature_metadata

        from onegov.chat import Message  # circular import

        match = self.session.query(Message).filter_by(type='file').filter(or_(
            text(
                "meta->'action_metadata'->>'old_digest' = :digest"
            ).bindparams(digest=digest),
            text(
                "meta->'action_metadata'->>'new_digest' = :digest"
            ).bindparams(digest=digest)
        )).with_entities(Message.meta).first()

        return match.meta['action_metadata'] if match else None
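# Signature lookup sketch (illustrative, not part of the original source);
# `sha256_hex` stands in for a real SHA256 hex digest:
#
#     metadata = FileCollection(session).locate_signature_metadata(sha256_hex)
#     if metadata is not None:
#         metadata['old_digest']  # digest before signing
#         metadata['new_digest']  # digest after signing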
class FileSetCollection(Generic[FileSetT]):
    """ Manages filesets. """

    @overload
    def __init__(
        self: 'FileSetCollection[FileSet]',
        session: 'Session',
        type: Literal['*', 'generic'] = '*'
    ) -> None: ...

    @overload
    def __init__(self, session: 'Session', type: str) -> None: ...

    def __init__(self, session: 'Session', type: str = '*') -> None:
        self.session = session
        self.type = type
    def query(self) -> 'Query[FileSetT]':
        if self.type != '*':
            model_class = FileSet.get_polymorphic_class(self.type, FileSet)
            # FIXME: same weird singularity as in FileCollection.query
            if model_class is FileSet:
                return self.session.query(  # type:ignore[return-value]
                    FileSet).filter_by(type=self.type)
            return self.session.query(model_class)
        return self.session.query(FileSet)
    def add(
        self,
        title: str,
        meta: dict[str, Any] | None = None,
        content: dict[str, Any] | None = None
    ) -> FileSetT:
        type_ = self.type if self.type != '*' else 'generic'

        fileset: FileSetT
        fileset = FileSet.get_polymorphic_class(  # type:ignore[assignment]
            type_, FileSet)()

        fileset.title = title
        fileset.type = type_

        if meta is not None:
            fileset.meta = meta

        if content is not None:
            fileset.content = content

        self.session.add(fileset)
        self.session.flush()

        return fileset
    def delete(self, fileset: FileSet) -> None:
        self.session.delete(fileset)
    def by_id(self, fileset_id: str) -> FileSetT | None:
        """ Returns the fileset with the given id or None. """
        return self.query().filter(FileSet.id == fileset_id).first()
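# FileSetCollection usage sketch (all names illustrative):
#
#     filesets = FileSetCollection(session)
#     reports = filesets.add('Annual Reports', meta={'access': 'public'})
#     assert filesets.by_id(reports.id) is reports  # same identity map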