Source code for file.utils

from __future__ import annotations

import hashlib
import magic
import os

from contextlib import contextmanager, suppress
from depot.io.utils import FileIntent
from functools import lru_cache
from mimetypes import guess_extension
from io import IOBase, BytesIO, UnsupportedOperation
from lxml import etree
from PIL import Image


from typing import IO, TYPE_CHECKING
if TYPE_CHECKING:
    from _typeshed import SupportsRead, StrOrBytesPath
    from collections.abc import Iterator



[docs]
def content_type_from_fileobj(fileobj: SupportsRead[bytes]) -> str:
    """ Gets the content type from a file obj. Depot has this as well, but it
    doesn't use python-magic. We use python-magic to be slower, but more
    accurate.

    """

    if hasattr(fileobj, 'seek'):
        with suppress(UnsupportedOperation):
            fileobj.seek(0)

    return magic.from_buffer(fileobj.read(4096), mime=True)




[docs]
def as_fileintent(
    content: bytes | IO[bytes],
    filename: str | None
) -> FileIntent:

    # this is far stricter than filedepot is, but our custom UploadedFile
    # requires more than just a read() to be implemented, this check won't
    # make mypy happy, since IOBase does not inherit from typing.IO
    # the minimum we could get away with right now is SupportsReadCloseSeek
    # which is not that far off what IO provides, so for simplicity let's
    # just use that
    msg = 'Content must be either a bytes string or a file-like object.'
    assert isinstance(content, (bytes, IOBase)), msg

    if isinstance(content, bytes):
        return FileIntent(BytesIO(content), filename, 'text/plain')
    else:
        if hasattr(content, 'mode'):  # type: ignore[unreachable]
            assert 'b' in content.mode, 'Open file in binary mode.'

        if hasattr(content, 'seek'):
            content.seek(0)

        return FileIntent(
            content, filename, content_type_from_fileobj(content))



@lru_cache(maxsize=1)

[docs]
def get_supported_image_mime_types() -> set[str]:
    """ Queries PIL for *all* locally supported mime types.

    Adapted from:
    https://github.com/python-pillow/Pillow/issues/1182#issuecomment-90572583

    """

    # Make sure all supported formats are registered.
    Image.init()

    # Not all PIL formats register a mime type, fill in the blanks ourselves.
    supported_types = {
        'image/bmp',
        'image/x-bmp',
        'image/x-MS-bmp',
        'image/x-icon',
        'image/x-ico',
        'image/x-win-bitmap',
        'image/x-pcx',
        'image/x-portable-pixmap',
        'image/x-tga'
    }

    for mime in Image.MIME.values():

        # exclude pdfs, postscripts and the like
        if not mime.startswith('application/'):
            supported_types.add(mime)

    return supported_types



# TODO: technically SupportsReadClose is enough

[docs]
def get_svg_size(svg: IO[bytes]) -> tuple[str | None, str | None]:
    # note, the svg size may not be in pixel, it can include the same units
    # the browser uses for styling, so we need to pass this information down
    # to the browser, instead of using it internally
    root = etree.parse(svg).getroot()
    return root.get('width'), root.get('height')




[docs]
def extension_for_content_type(
    content_type: str,
    filename: str | None = None
) -> str:
    """ Gets the extension for the given content type. Note that this is
    *meant for display only*. A file claiming to be a PDF might not be one,
    but this function would not let you know that.

    """

    if filename is not None:
        # previously we checked if the extension was at most 3 characters
        # while I understand the motivation behind this, I don't think it
        # is a good idea to make long file extensions not work, just to
        # support files without an extension that have a `.` in the name
        _, sep, ext = filename.rpartition('.')
        ext = ext.lower() if sep else ''
    else:
        ext = guess_extension(content_type, strict=False) or ''

    return ext.strip('. ')




[docs]
def get_image_size(image: Image.Image) -> tuple[str, str]:
    w, h = image.size
    return f'{w}px', f'{h}px'




[docs]
def digest(
    fileobj: SupportsRead[bytes],
    type: str = 'sha256',
    chunksize: int = 4096
) -> str:

    if hasattr(fileobj, 'seek'):
        with suppress(UnsupportedOperation):
            fileobj.seek(0)

    digest = getattr(hashlib, type)()

    for chunk in iter(lambda: fileobj.read(chunksize), b''):
        digest.update(chunk)

    return digest.hexdigest()




[docs]
def word_count(text: str) -> int:
    """ The word-count of the given text. Goes through the string exactly
    once and has constant memory usage. Not super sophisticated though.

    """
    if not text:
        return 0

    count = 0
    inside_word = False

    for char in text:
        if char.isspace():
            inside_word = False
        elif not inside_word:
            count += 1
            inside_word = True

    return count




[docs]
def name_without_extension(name: str) -> str:
    # previously we checked if the extension was at most 3 characters
    # while I understand the motivation behind this, I don't think it
    # is a good idea to make long file extensions not work, just to
    # support files without an extension that have a `.` in the name
    name, sep, ext = name.rpartition('.')
    # if there is no sep, then the original string will be in ext
    return name if sep else ext



@contextmanager

[docs]
def current_dir(dir: StrOrBytesPath) -> Iterator[None]:
    previous = os.getcwd()
    os.chdir(dir)
    yield
    os.chdir(previous)



# we don't support *all* the image types PIL supports

[docs]
EXCLUDED_IMAGE_TYPES = {'application/pdf'}


[docs]
IMAGE_MIME_TYPES = get_supported_image_mime_types() - EXCLUDED_IMAGE_TYPES


[docs]
IMAGE_MIME_TYPES_AND_SVG = IMAGE_MIME_TYPES | {'image/svg+xml'}