diff --git a/README.md b/README.md index 0cd3124..9e053bb 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,10 @@ poetry run repub - [x] Offlines RSS feed xml - [x] Downloads media and enclosures - [x] Rewrites media urls -- [ ] Media compression +- [x] Image normalization (JPG, RGB) +- [x] Audio compression +- [ ] Image compression +- [ ] Video compression - [ ] Download and rewrite media embedded in content/CDATA fields - [ ] Config file to drive the program - [ ] Daemonize the program diff --git a/poetry.lock b/poetry.lock index 735c2e1..22c1f25 100644 --- a/poetry.lock +++ b/poetry.lock @@ -411,6 +411,23 @@ files = [ [package.dependencies] sgmllib3k = "*" +[[package]] +name = "ffmpeg-python" +version = "0.2.0" +description = "Python bindings for FFmpeg - with complex filtering support" +optional = false +python-versions = "*" +files = [ + {file = "ffmpeg-python-0.2.0.tar.gz", hash = "sha256:65225db34627c578ef0e11c8b1eb528bb35e024752f6f10b78c011f6f64c4127"}, + {file = "ffmpeg_python-0.2.0-py3-none-any.whl", hash = "sha256:ac441a0404e053f8b6a1113a77c0f452f1cfc62f6344a769475ffdc0f56c23c5"}, +] + +[package.dependencies] +future = "*" + +[package.extras] +dev = ["Sphinx (==2.1.0)", "future (==0.17.1)", "numpy (==1.16.4)", "pytest (==4.6.1)", "pytest-mock (==1.10.4)", "tox (==3.12.1)"] + [[package]] name = "filelock" version = "3.13.4" @@ -461,6 +478,17 @@ flake8 = ">=3" [package.extras] develop = ["build", "twine"] +[[package]] +name = "future" +version = "1.0.0" +description = "Clean single-source support for Python 3 and 2" +optional = false +python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "future-1.0.0-py3-none-any.whl", hash = "sha256:929292d34f5872e70396626ef385ec22355a1fae8ad29e1a734c3e43f9fbc216"}, + {file = "future-1.0.0.tar.gz", hash = "sha256:bd2968309307861edae1458a4f8a4f3598c03be43b97521076aebf5d94c07b05"}, +] + [[package]] name = "hyperlink" version = "21.0.0" @@ -1600,4 +1628,4 @@ testing = ["coverage (>=5.0.3)", 
logger = logging.getLogger(__name__)


def media_info(file_path):
    """Probe ``file_path`` and return the parsed metadata.

    Thin wrapper around ``ffmpeg.probe``; whatever it raises propagates
    unchanged to the caller.
    """
    return ffmpeg.probe(file_path)


def bitrate(info) -> float:
    """Return the overall bit rate recorded in probe metadata.

    Args:
        info: Mapping as returned by :func:`media_info`.

    Returns:
        ``info["format"]["bit_rate"]`` converted to ``int``, or ``math.inf``
        when the value is missing or cannot be parsed, so that an unknown
        bit rate never compares as "low enough".
    """
    # The exceptions must be listed as a tuple. ``KeyError | ValueError``
    # evaluates to a ``types.UnionType``, which makes the ``except`` clause
    # itself raise ``TypeError`` as soon as an exception has to be matched.
    try:
        return int(info["format"]["bit_rate"])
    except (KeyError, TypeError, ValueError):
        return math.inf


def format(info):  # noqa: A001 - shadows the builtin, kept for callers
    """Return the container format name from probe metadata.

    Args:
        info: Mapping as returned by :func:`media_info`.

    Returns:
        ``info["format"]["format_name"]``, or ``None`` when it is absent.
    """
    try:
        return info["format"]["format_name"]
    except (KeyError, TypeError, ValueError):
        return None


def compression_settings(input_file, settings):
    """Decide whether ``input_file`` has to be re-encoded, and how.

    Args:
        input_file: Path of the audio file to probe.
        settings: Mapping that may carry ``REPUBLISHER_AUDIO_BITRATE``
            (default ``96000``) and ``REPUBLISHER_AUDIO_FORMAT`` (default
            ``"mp3"``). ``None`` is treated as an empty mapping.

    Returns:
        ``None`` when the file already satisfies both the bit-rate ceiling
        and the requested format; otherwise a dict with the keys
        ``"bitrate"`` and ``"ext"`` in the shape :func:`compress_audio`
        expects.
    """
    if settings is None:
        settings = {}
    max_bitrate = settings.get("REPUBLISHER_AUDIO_BITRATE", 96000)
    wanted_format = settings.get("REPUBLISHER_AUDIO_FORMAT", "mp3")

    info = media_info(input_file)
    source_bitrate = bitrate(info)
    bitrate_ok = source_bitrate <= max_bitrate
    format_ok = format(info) == wanted_format

    if bitrate_ok and format_ok:
        return None

    # Never raise the bit rate: a source that is already under the ceiling
    # keeps its own rate and is only converted to the target format.
    target_bitrate = source_bitrate if bitrate_ok else max_bitrate
    # NOTE(review): the extension is fixed to "mp3" even when
    # REPUBLISHER_AUDIO_FORMAT asks for something else - confirm whether it
    # should follow the setting.
    return {"bitrate": target_bitrate, "ext": "mp3"}


def compress_audio(input_file, output_file_base, settings):
    """Re-encode the first audio stream of ``input_file`` with ffmpeg.

    Args:
        input_file: Path of the source file.
        output_file_base: Output path without its extension.
        settings: Dict with the keys ``"ext"`` and ``"bitrate"``, as produced
            by :func:`compression_settings`.

    Returns:
        The path that was written, ``f"{output_file_base}.{ext}"``.

    Raises:
        RuntimeError: If ffmpeg fails; the message carries ffmpeg's stderr
            and the original ``ffmpeg.Error`` is chained as the cause.
    """
    ext = settings["ext"]
    br = settings["bitrate"]
    output_file = f"{output_file_base}.{ext}"
    logger.info(
        "Compressing audio %s to %s target_br=%s", input_file, output_file, br
    )
    stream = ffmpeg.input(input_file).output(
        output_file,
        **{"b:a": f"{br}", "map": "0:a:0"},
        # "error" rather than "quiet": ffmpeg must still emit its diagnostics
        # so they can be reported below. Nothing reaches the console because
        # both streams are captured.
        loglevel="error",
    )
    try:
        stream.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        # stderr is only populated when it was captured; guard anyway so the
        # handler can never fail while building the message.
        detail = e.stderr.decode(errors="replace") if e.stderr else ""
        raise RuntimeError(f"Failed to compress audio: {detail}") from e
    return output_file
f.write(buf.read()) + + s = media.compression_settings(tmp_file, {}) + if s is not None: + converted_file = media.compress_audio(tmp_file, converted_file_base, s) + buf_converted = BytesIO() + with open(converted_file, "rb") as f: + buf_converted.write(f.read()) + buf_converted.seek(0) + yield path, buf_converted + else: + logger.info( + f"Skipping audio compression for {path}, it meets requirements" + ) + buf.seek(0) + yield path, buf + class VideoPipeline(BaseFilesPipeline): def __init__(self, store_uri: Union[str, PathLike], **kwargs): diff --git a/shell.nix b/shell.nix index d416b8f..7e78f14 100644 --- a/shell.nix +++ b/shell.nix @@ -1,16 +1,26 @@ -{ system ? "x86_64-linux", pkgs ? import { inherit system; }, dev ? true, }: +{ + system ? "x86_64-linux", + pkgs ? import { inherit system; }, + dev ? true, +}: let pyCurrent = pkgs.python311; -poetryExtras = if dev then ["dev"] else []; + poetryExtras = if dev then [ "dev" ] else [ ]; poetryInstallExtras = ( - if poetryExtras == [] then "" - else pkgs.lib.concatStrings [ " --with=" (pkgs.lib.concatStringsSep "," poetryExtras) ] + if poetryExtras == [ ] then + "" + else + pkgs.lib.concatStrings [ + " --with=" + (pkgs.lib.concatStringsSep "," poetryExtras) + ] ); packages = [ - pyCurrent + pkgs.ffmpeg_5-headless + #(pyCurrent (ps: with ps; [ ffmpeg-python ])) pkgs.zsh - (pkgs.poetry.withPlugins(ps: with ps; [poetry-plugin-up])) + (pkgs.poetry.withPlugins (ps: with ps; [ poetry-plugin-up ])) ]; LD_LIBRARY_PATH = pkgs.lib.makeLibraryPath [ @@ -26,16 +36,16 @@ poetryExtras = if dev then ["dev"] else []; # Use python from path, so you can use a different version to the one bundled with poetry POETRY_VIRTUALENVS_PREFER_ACTIVE_PYTHON = "true"; in - pkgs.mkShell { - buildInputs = packages; - shellHook = '' - export SHELL=${pkgs.zsh} - export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" - export POETRY_VIRTUALENVS_IN_PROJECT="${POETRY_VIRTUALENVS_IN_PROJECT}" - export POETRY_VIRTUALENVS_PATH="${POETRY_VIRTUALENVS_PATH}" - export 
POETRY_VIRTUALENVS_PREFER_ACTIVE_PYTHON="${POETRY_VIRTUALENVS_PREFER_ACTIVE_PYTHON}" - export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring - poetry env use "${pyCurrent}/bin/python" - poetry install -vv --sync${poetryInstallExtras} - ''; - } +pkgs.mkShell { + buildInputs = packages; + shellHook = '' + export SHELL=${pkgs.zsh} + export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" + export POETRY_VIRTUALENVS_IN_PROJECT="${POETRY_VIRTUALENVS_IN_PROJECT}" + export POETRY_VIRTUALENVS_PATH="${POETRY_VIRTUALENVS_PATH}" + export POETRY_VIRTUALENVS_PREFER_ACTIVE_PYTHON="${POETRY_VIRTUALENVS_PREFER_ACTIVE_PYTHON}" + export PYTHON_KEYRING_BACKEND=keyring.backends.null.Keyring + poetry env use "${pyCurrent}/bin/python" + poetry install -vv --sync${poetryInstallExtras} + ''; +}