# Source code for file.utils

import hashlib
import magic
import os

from contextlib import contextmanager, suppress
from depot.io.utils import FileIntent
from functools import lru_cache
from mimetypes import guess_extension
from io import IOBase, BytesIO, UnsupportedOperation
from lxml import etree
from PIL import Image


from typing import IO, TYPE_CHECKING
if TYPE_CHECKING:
    from _typeshed import SupportsRead, StrOrBytesPath
    from collections.abc import Iterator


def content_type_from_fileobj(fileobj: 'SupportsRead[bytes]') -> str:
    """ Sniffs the mime type of the given file object using python-magic.

    Depot ships a similar helper, but it does not use python-magic. We
    accept python-magic being slower in exchange for better accuracy.

    If the file object has a ``seek`` method it is rewound to the start
    first (streams that refuse to seek are read from wherever they are).
    At most the first 4096 bytes are read and the file object is *not*
    rewound again afterwards.

    :param fileobj: A binary file-like object providing ``read``.
    :return: The mime type reported by ``magic.from_buffer``.

    """
    if hasattr(fileobj, 'seek'):
        try:
            fileobj.seek(0)
        except UnsupportedOperation:
            # not seekable after all, sniff from the current position
            pass

    head = fileobj.read(4096)
    return magic.from_buffer(head, mime=True)
def as_fileintent(
    content: bytes | IO[bytes],
    filename: str | None
) -> FileIntent:
    """ Wraps the given content in a depot ``FileIntent``.

    Bytes are wrapped in a ``BytesIO`` and stored as ``text/plain``. File
    objects are rewound (if they can seek) and their content type is
    detected through :func:`content_type_from_fileobj`.

    :param content: Either a bytes string or a binary file-like object.
    :param filename: The filename passed on to the ``FileIntent``.
    :raises AssertionError: If the content is neither bytes nor an
        ``IOBase`` instance, or if the file object exposes a ``mode``
        that is not binary.

    """
    # this is far stricter than filedepot is, but our custom UploadedFile
    # requires more than just a read() to be implemented, this check won't
    # make mypy happy, since IOBase does not inherit from typing.IO
    # the minimum we could get away with right now is SupportsReadCloseSeek
    # which is not that far off what IO provides, so for simplicity let's
    # just use that
    msg = 'Content must be either a bytes string or a file-like object.'
    assert isinstance(content, (bytes, IOBase)), msg

    if isinstance(content, bytes):
        return FileIntent(BytesIO(content), filename, 'text/plain')

    if hasattr(content, 'mode'):  # type: ignore[unreachable]
        assert 'b' in content.mode, 'Open file in binary mode.'

    if hasattr(content, 'seek'):
        content.seek(0)

    content_type = content_type_from_fileobj(content)
    return FileIntent(content, filename, content_type)
@lru_cache(maxsize=1)
def get_supported_image_mime_types() -> set[str]:
    """ Returns the mime types of *all* image formats PIL supports locally.

    Adapted from:
    https://github.com/python-pillow/Pillow/issues/1182#issuecomment-90572583

    The result is computed once and then served from the cache.

    """
    # PIL only knows about all of its formats once they are registered
    Image.init()

    # mime types of formats for which PIL does not register one itself
    unregistered = {
        'image/bmp',
        'image/x-bmp',
        'image/x-MS-bmp',
        'image/x-icon',
        'image/x-ico',
        'image/x-win-bitmap',
        'image/x-pcx',
        'image/x-portable-pixmap',
        'image/x-tga'
    }

    # pdfs, postscripts and the like are registered too, leave those out
    registered = {
        mime for mime in Image.MIME.values()
        if not mime.startswith('application/')
    }

    return unregistered | registered
# TODO: technically SupportsReadClose is enough
def get_svg_size(svg: IO[bytes]) -> tuple[str | None, str | None]:
    """ Returns the ``width`` and ``height`` attributes of the SVG's root
    element, each being ``None`` if the attribute is missing.

    The values are returned as found in the document. They are not
    necessarily pixels, they may carry the same units the browser uses for
    styling. That is why they are meant to be handed down to the browser
    rather than being used internally.

    """
    svg_root = etree.parse(svg).getroot()
    width = svg_root.get('width')
    height = svg_root.get('height')
    return width, height
def extension_for_content_type(
    content_type: str,
    filename: str | None = None
) -> str:
    """ Returns a file extension (without the leading dot) suitable for the
    given content type.

    If a filename is given, its lowercased extension is used and the
    content type is ignored. A filename without a dot yields an empty
    string. Without a filename the extension is guessed from the content
    type, falling back to an empty string if there is no match.

    Note that this is *meant for display only*. A file claiming to be a
    PDF might not be one, but this function would not let you know that.

    """
    if filename is None:
        extension = guess_extension(content_type, strict=False) or ''
    elif '.' in filename:
        # we used to only accept extensions of up to 3 characters, so
        # names with a `.` but no extension would keep working, but
        # breaking long extensions is not worth that, so whatever follows
        # the last `.` is the extension
        extension = filename.rpartition('.')[2].lower()
    else:
        extension = ''

    return extension.strip('. ')
def get_image_size(image: Image.Image) -> tuple[str, str]:
    """ Returns the width and height of the given image as strings with a
    ``px`` suffix (e.g. ``('640px', '480px')``).

    """
    w, h = image.size
    width = f'{w}px'
    height = f'{h}px'
    return width, height
def digest(
    fileobj: 'SupportsRead[bytes]',
    type: str = 'sha256',
    chunksize: int = 4096
) -> str:
    """ Calculates the hex digest of the given file object's content.

    If the file object has a ``seek`` method it is rewound to the start
    first (streams that refuse to seek are read from wherever they are).
    The content is then read in chunks until the file object is exhausted.

    :param fileobj: A binary file-like object providing ``read``.
    :param type: Name of the ``hashlib`` constructor to use.
    :param chunksize: Number of bytes requested per read.
    :return: The hexadecimal digest.

    """
    if hasattr(fileobj, 'seek'):
        try:
            fileobj.seek(0)
        except UnsupportedOperation:
            # not seekable after all, hash from the current position
            pass

    hasher = getattr(hashlib, type)()

    while True:
        chunk = fileobj.read(chunksize)
        if chunk == b'':
            break
        hasher.update(chunk)

    return hasher.hexdigest()
def word_count(text: str) -> int:
    """ Counts the words in the given text.

    A word is any run of characters that are not whitespace. The text is
    walked through exactly once using constant memory. Not super
    sophisticated though.

    """
    if not text:
        return 0

    words = 0
    after_whitespace = True

    for character in text:
        is_whitespace = character.isspace()

        # a word starts wherever a non-whitespace character follows
        # whitespace (or the start of the text)
        if after_whitespace and not is_whitespace:
            words += 1

        after_whitespace = is_whitespace

    return words
def name_without_extension(name: str) -> str:
    """ Returns the given name with everything from the last ``.`` onwards
    removed. Names without a ``.`` are returned unchanged.

    """
    # we used to only strip extensions of up to 3 characters, so names
    # with a `.` but no extension would keep working, but breaking long
    # extensions is not worth that, so we always cut at the last `.`
    stem, dot, remainder = name.rpartition('.')

    if dot:
        return stem

    # without a separator rpartition puts the whole string in the last slot
    return remainder
@contextmanager
def current_dir(dir: 'StrOrBytesPath') -> 'Iterator[None]':
    """ Temporarily changes the working directory of the process.

    The working directory is switched to ``dir`` for the duration of the
    ``with`` block and switched back to the previous one afterwards, even
    if the block raises an exception.

    :param dir: The directory to change into.
    :raises OSError: If changing into ``dir`` fails, in which case the
        working directory is left untouched.

    """
    previous = os.getcwd()
    os.chdir(dir)

    try:
        yield
    finally:
        # without the finally, an exception inside the with block would
        # leave the process stuck in the temporary directory
        os.chdir(previous)
# we don't support *all* the image types PIL supports
EXCLUDED_IMAGE_TYPES = {'application/pdf'}

# the locally supported image mime types, minus the excluded ones
IMAGE_MIME_TYPES = get_supported_image_mime_types().difference(
    EXCLUDED_IMAGE_TYPES
)

# the supported image mime types plus SVG
IMAGE_MIME_TYPES_AND_SVG = IMAGE_MIME_TYPES.union({'image/svg+xml'})