Source code for pdf.utils
from __future__ import annotations
from pdftotext import PDF # type:ignore
from onegov.pdf import log
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from _typeshed import SupportsRead
[docs]
def extract_pdf_info(
    content: SupportsRead[bytes],
    remove: str = '\0'
) -> tuple[int, str]:
    """ Returns the page count and the normalized text of a PDF.

    The given file-like object is rewound to its start when it offers a
    working ``seek``; if that fails, a debug message is logged and the
    content is parsed from wherever it currently stands.

    Every character contained in ``remove`` is dropped from the text of
    each page, and any run of whitespace inside a page is collapsed to a
    single space. The pages are then joined with a single space and the
    result is stripped of leading and trailing whitespace.

    Requires poppler.

    :param content: Readable binary object holding the PDF.
    :param remove: Characters to delete from the extracted text.
    :return: A tuple of the number of pages and the extracted text.

    """

    def normalize(raw: str) -> str:
        # Delete the unwanted characters one at a time, then squeeze all
        # whitespace runs by splitting and re-joining the words.
        stripped = raw
        for unwanted in remove:
            stripped = stripped.replace(unwanted, '')
        words = stripped.split()
        return ' '.join(words)

    # Rewinding is best-effort: not every readable object can seek.
    try:
        content.seek(0)  # type:ignore[attr-defined]
    except Exception:
        log.debug('Content does not support seek')

    document = PDF(content)
    page_count = len(document)
    cleaned_pages = [normalize(page) for page in document]
    return (page_count, ' '.join(cleaned_pages).strip())