From dc4e79c130f1d1ed04f7c31c1edbb61ab3a8293f Mon Sep 17 00:00:00 2001 From: Abel Luck Date: Thu, 18 Apr 2024 15:27:00 +0200 Subject: [PATCH] implement media pipelines and url rewriting --- .gitignore | 1 + LICENSE.md | 660 ++++++++++++++++++++++++++++++++++++ README.md | 27 ++ poetry.lock | 96 +++++- pyproject.toml | 1 + repub/entrypoint.py | 102 ++++-- repub/exporters.py | 2 +- repub/items.py | 14 +- repub/pipelines.py | 113 ++---- repub/postprocessing.py | 11 - repub/rss.py | 2 +- repub/settings.py | 7 +- repub/spiders/rss_spider.py | 93 ++++- repub/utils.py | 74 ++++ 14 files changed, 1079 insertions(+), 124 deletions(-) create mode 100644 LICENSE.md create mode 100644 repub/utils.py diff --git a/.gitignore b/.gitignore index b359132..7e75396 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,4 @@ out tmp/ /test*py data +logs diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..c6f01c6 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,660 @@ +# GNU AFFERO GENERAL PUBLIC LICENSE + +Version 3, 19 November 2007 + +Copyright (C) 2007 Free Software Foundation, Inc. + + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + +## Preamble + +The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + +The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains +free software for all its users. + +When we speak of free software, we are referring to freedom, not +price. 
Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + +Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. + +A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + +The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + +An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing +under this license. + +The precise terms and conditions for copying, distribution and +modification follow. 
+ +## TERMS AND CONDITIONS + +### 0. Definitions. + +"This License" refers to version 3 of the GNU Affero General Public +License. + +"Copyright" also means copyright-like laws that apply to other kinds +of works, such as semiconductor masks. + +"The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + +To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of +an exact copy. The resulting work is called a "modified version" of +the earlier work or a work "based on" the earlier work. + +A "covered work" means either the unmodified Program or a work based +on the Program. + +To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + +To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user +through a computer network, with no transfer of a copy, is not +conveying. + +An interactive user interface displays "Appropriate Legal Notices" to +the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + +### 1. Source Code. 
+ +The "source code" for a work means the preferred form of the work for +making modifications to it. "Object code" means any non-source form of +a work. + +A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + +The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + +The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. 
+ +The Corresponding Source need not include anything that users can +regenerate automatically from other parts of the Corresponding Source. + +The Corresponding Source for a work in source code form is that same +work. + +### 2. Basic Permissions. + +All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + +You may make, run and propagate covered works that you do not convey, +without conditions so long as your license otherwise remains in force. +You may convey covered works to others for the sole purpose of having +them make modifications exclusively for you, or provide you with +facilities for running those works, provided that you comply with the +terms of this License in conveying all material for which you do not +control copyright. Those thus making or running the covered works for +you must do so exclusively on your behalf, under your direction and +control, on terms that prohibit them from making any copies of your +copyrighted material outside their relationship with you. + +Conveying under any other circumstances is permitted solely under the +conditions stated below. Sublicensing is not allowed; section 10 makes +it unnecessary. + +### 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + +No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. 
+ +When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such +circumvention is effected by exercising rights under this License with +respect to the covered work, and you disclaim any intention to limit +operation or modification of the work as a means of enforcing, against +the work's users, your or third parties' legal rights to forbid +circumvention of technological measures. + +### 4. Conveying Verbatim Copies. + +You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + +You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + +### 5. Conveying Modified Source Versions. + +You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these +conditions: + +- a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. +- b) The work must carry prominent notices stating that it is + released under this License and any conditions added under + section 7. This requirement modifies the requirement in section 4 + to "keep intact all notices". +- c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. 
This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. +- d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + +A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + +### 6. Conveying Non-Source Forms. + +You may convey a covered work in object code form under the terms of +sections 4 and 5, provided that you also convey the machine-readable +Corresponding Source under the terms of this License, in one of these +ways: + +- a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. 
+- b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the Corresponding + Source from a network server at no charge. +- c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. +- d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+- e) Convey the object code using peer-to-peer transmission, + provided you inform other peers where the object code and + Corresponding Source of the work are being offered to the general + public at no charge under subsection 6d. + +A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + +A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, +family, or household purposes, or (2) anything designed or sold for +incorporation into a dwelling. In determining whether a product is a +consumer product, doubtful cases shall be resolved in favor of +coverage. For a particular product received by a particular user, +"normally used" refers to a typical or common use of that class of +product, regardless of the status of the particular user or of the way +in which the particular user actually uses, or expects or is expected +to use, the product. A product is a consumer product regardless of +whether the product has substantial commercial, industrial or +non-consumer uses, unless such uses represent the only significant +mode of use of the product. + +"Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to +install and execute modified versions of a covered work in that User +Product from a modified version of its Corresponding Source. The +information must suffice to ensure that the continued functioning of +the modified object code is in no case prevented or interfered with +solely because modification has been made. 
+ +If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + +The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or +updates for a work that has been modified or installed by the +recipient, or for the User Product in which it has been modified or +installed. Access to a network may be denied when the modification +itself materially and adversely affects the operation of the network +or violates the rules and protocols for communication across the +network. + +Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + +### 7. Additional Terms. + +"Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + +When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + +Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders +of that material) supplement the terms of this License with terms: + +- a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or +- b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or +- c) Prohibiting misrepresentation of the origin of that material, + or requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or +- d) Limiting the use for publicity purposes of names of licensors + or authors of the material; or +- e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or +- f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions + of it) with contractual assumptions of liability to the recipient, + for any liability that these contractual assumptions directly + impose on those licensors and authors. + +All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. 
If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + +If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + +Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; the +above requirements apply either way. + +### 8. Termination. + +You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + +However, if you cease all violation of this License, then your license +from a particular copyright holder is reinstated (a) provisionally, +unless and until the copyright holder explicitly and finally +terminates your license, and (b) permanently, if the copyright holder +fails to notify you of the violation by some reasonable means prior to +60 days after the cessation. + +Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. 
+ +Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + +### 9. Acceptance Not Required for Having Copies. + +You are not required to accept this License in order to receive or run +a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + +### 10. Automatic Licensing of Downstream Recipients. + +Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + +An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + +You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. 
For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + +### 11. Patents. + +A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + +A contributor's "essential patent claims" are all patent claims owned +or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + +Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + +In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ +If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + +If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + +A patent license is "discriminatory" if it does not include within the +scope of its coverage, prohibits the exercise of, or is conditioned on +the non-exercise of one or more of the rights that are specifically +granted under this License. 
You may not convey a covered work if you +are a party to an arrangement with a third party that is in the +business of distributing software, under which you make payment to the +third party based on the extent of your activity of conveying the +work, and under which the third party grants, to any of the parties +who would receive the covered work from you, a discriminatory patent +license (a) in connection with copies of the covered work conveyed by +you (or copies made from those copies), or (b) primarily for and in +connection with specific products or compilations that contain the +covered work, unless you entered into that arrangement, or that patent +license was granted, prior to 28 March 2007. + +Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + +### 12. No Surrender of Others' Freedom. + +If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under +this License and any other pertinent obligations, then as a +consequence you may not convey it at all. For example, if you agree to +terms that obligate you to collect a royalty for further conveying +from those to whom you convey the Program, the only way you could +satisfy both those terms and this License would be to refrain entirely +from conveying the Program. + +### 13. Remote Network Interaction; Use with the GNU General Public License. 
+ +Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your +version supports such interaction) an opportunity to receive the +Corresponding Source of your version by providing access to the +Corresponding Source from a network server at no charge, through some +standard or customary means of facilitating copying of software. This +Corresponding Source shall include the Corresponding Source for any +work covered by version 3 of the GNU General Public License that is +incorporated pursuant to the following paragraph. + +Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + +### 14. Revised Versions of this License. + +The Free Software Foundation may publish revised and/or new versions +of the GNU Affero General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever +published by the Free Software Foundation. 
+ +If the Program specifies that a proxy can decide which future versions +of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + +Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + +### 15. Disclaimer of Warranty. + +THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT +WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND +PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE +DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, REPAIR OR +CORRECTION. + +### 16. Limitation of Liability. + +IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR +CONVEYS THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES +ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT +NOT LIMITED TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR +LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM +TO OPERATE WITH ANY OTHER PROGRAMS), EVEN IF SUCH HOLDER OR OTHER +PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +### 17. Interpretation of Sections 15 and 16. 
+ +If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + +END OF TERMS AND CONDITIONS + +## How to Apply These Terms to Your New Programs + +If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these +terms. + +To do so, attach the following notices to the program. It is safest to +attach them to the start of each source file to most effectively state +the exclusion of warranty; and each file should have at least the +"copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper +mail. + +If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code.
 There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for +the specific requirements. + +You should also get your employer (if you work as a programmer) or +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. For more information on this, and how to apply and follow +the GNU AGPL, see <https://www.gnu.org/licenses/>. diff --git a/README.md b/README.md index e69de29..3d0a0bc 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,27 @@ +# republisher-redux + +``` shell +mkdir logs out +poetry install +poetry run repub +``` + + +## License + +republisher-redux, a tool to mirror RSS/ATOM feeds completely offline + +Copyright (C) 2024 Abel Luck + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see <https://www.gnu.org/licenses/>.
diff --git a/poetry.lock b/poetry.lock index 80da2e0..735c2e1 100644 --- a/poetry.lock +++ b/poetry.lock @@ -539,13 +539,13 @@ files = [ [[package]] name = "itemloaders" -version = "1.1.0" +version = "1.2.0" description = "Base library for scrapy's ItemLoader" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "itemloaders-1.1.0-py3-none-any.whl", hash = "sha256:c8c82fe0c11fc4cdd08ec04df0b3c43f3cb7190002edb517e02d55de8efc2aeb"}, - {file = "itemloaders-1.1.0.tar.gz", hash = "sha256:21d81c61da6a08b48e5996288cdf3031c0f92e5d0075920a0242527523e14a48"}, + {file = "itemloaders-1.2.0-py3-none-any.whl", hash = "sha256:6ec5753dafdc69262774694c78c9ec44605672586b40a7134a097a5df601a442"}, + {file = "itemloaders-1.2.0.tar.gz", hash = "sha256:fc2307f984116b010d6101a68a6a133ac8de927320b0ab696f31ad710a8d8d98"}, ] [package.dependencies] @@ -890,6 +890,92 @@ files = [ {file = "pbr-6.0.0.tar.gz", hash = "sha256:d1377122a5a00e2f940ee482999518efe16d745d423a670c27773dfbc3c9a7d9"}, ] +[[package]] +name = "pillow" +version = "10.3.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:90b9e29824800e90c84e4022dd5cc16eb2d9605ee13f05d47641eb183cd73d45"}, + {file = "pillow-10.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a2c405445c79c3f5a124573a051062300936b0281fee57637e706453e452746c"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:78618cdbccaa74d3f88d0ad6cb8ac3007f1a6fa5c6f19af64b55ca170bfa1edf"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:261ddb7ca91fcf71757979534fb4c128448b5b4c55cb6152d280312062f69599"}, + {file = "pillow-10.3.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:ce49c67f4ea0609933d01c0731b34b8695a7a748d6c8d186f95e7d085d2fe475"}, + {file = 
"pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:b14f16f94cbc61215115b9b1236f9c18403c15dd3c52cf629072afa9d54c1cbf"}, + {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d33891be6df59d93df4d846640f0e46f1a807339f09e79a8040bc887bdcd7ed3"}, + {file = "pillow-10.3.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b50811d664d392f02f7761621303eba9d1b056fb1868c8cdf4231279645c25f5"}, + {file = "pillow-10.3.0-cp310-cp310-win32.whl", hash = "sha256:ca2870d5d10d8726a27396d3ca4cf7976cec0f3cb706debe88e3a5bd4610f7d2"}, + {file = "pillow-10.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:f0d0591a0aeaefdaf9a5e545e7485f89910c977087e7de2b6c388aec32011e9f"}, + {file = "pillow-10.3.0-cp310-cp310-win_arm64.whl", hash = "sha256:ccce24b7ad89adb5a1e34a6ba96ac2530046763912806ad4c247356a8f33a67b"}, + {file = "pillow-10.3.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:5f77cf66e96ae734717d341c145c5949c63180842a545c47a0ce7ae52ca83795"}, + {file = "pillow-10.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e4b878386c4bf293578b48fc570b84ecfe477d3b77ba39a6e87150af77f40c57"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fdcbb4068117dfd9ce0138d068ac512843c52295ed996ae6dd1faf537b6dbc27"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9797a6c8fe16f25749b371c02e2ade0efb51155e767a971c61734b1bf6293994"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:9e91179a242bbc99be65e139e30690e081fe6cb91a8e77faf4c409653de39451"}, + {file = "pillow-10.3.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:1b87bd9d81d179bd8ab871603bd80d8645729939f90b71e62914e816a76fc6bd"}, + {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:81d09caa7b27ef4e61cb7d8fbf1714f5aec1c6b6c5270ee53504981e6e9121ad"}, + {file = "pillow-10.3.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = 
"sha256:048ad577748b9fa4a99a0548c64f2cb8d672d5bf2e643a739ac8faff1164238c"}, + {file = "pillow-10.3.0-cp311-cp311-win32.whl", hash = "sha256:7161ec49ef0800947dc5570f86568a7bb36fa97dd09e9827dc02b718c5643f09"}, + {file = "pillow-10.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:8eb0908e954d093b02a543dc963984d6e99ad2b5e36503d8a0aaf040505f747d"}, + {file = "pillow-10.3.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e6f7d1c414191c1199f8996d3f2282b9ebea0945693fb67392c75a3a320941f"}, + {file = "pillow-10.3.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:e46f38133e5a060d46bd630faa4d9fa0202377495df1f068a8299fd78c84de84"}, + {file = "pillow-10.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:50b8eae8f7334ec826d6eeffaeeb00e36b5e24aa0b9df322c247539714c6df19"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9d3bea1c75f8c53ee4d505c3e67d8c158ad4df0d83170605b50b64025917f338"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:19aeb96d43902f0a783946a0a87dbdad5c84c936025b8419da0a0cd7724356b1"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:74d28c17412d9caa1066f7a31df8403ec23d5268ba46cd0ad2c50fb82ae40462"}, + {file = "pillow-10.3.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:ff61bfd9253c3915e6d41c651d5f962da23eda633cf02262990094a18a55371a"}, + {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d886f5d353333b4771d21267c7ecc75b710f1a73d72d03ca06df49b09015a9ef"}, + {file = "pillow-10.3.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4b5ec25d8b17217d635f8935dbc1b9aa5907962fae29dff220f2659487891cd3"}, + {file = "pillow-10.3.0-cp312-cp312-win32.whl", hash = "sha256:51243f1ed5161b9945011a7360e997729776f6e5d7005ba0c6879267d4c5139d"}, + {file = "pillow-10.3.0-cp312-cp312-win_amd64.whl", hash = "sha256:412444afb8c4c7a6cc11a47dade32982439925537e483be7c0ae0cf96c4f6a0b"}, + {file = 
"pillow-10.3.0-cp312-cp312-win_arm64.whl", hash = "sha256:798232c92e7665fe82ac085f9d8e8ca98826f8e27859d9a96b41d519ecd2e49a"}, + {file = "pillow-10.3.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:4eaa22f0d22b1a7e93ff0a596d57fdede2e550aecffb5a1ef1106aaece48e96b"}, + {file = "pillow-10.3.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cd5e14fbf22a87321b24c88669aad3a51ec052eb145315b3da3b7e3cc105b9a2"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1530e8f3a4b965eb6a7785cf17a426c779333eb62c9a7d1bbcf3ffd5bf77a4aa"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5d512aafa1d32efa014fa041d38868fda85028e3f930a96f85d49c7d8ddc0383"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:339894035d0ede518b16073bdc2feef4c991ee991a29774b33e515f1d308e08d"}, + {file = "pillow-10.3.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:aa7e402ce11f0885305bfb6afb3434b3cd8f53b563ac065452d9d5654c7b86fd"}, + {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:0ea2a783a2bdf2a561808fe4a7a12e9aa3799b701ba305de596bc48b8bdfce9d"}, + {file = "pillow-10.3.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:c78e1b00a87ce43bb37642c0812315b411e856a905d58d597750eb79802aaaa3"}, + {file = "pillow-10.3.0-cp38-cp38-win32.whl", hash = "sha256:72d622d262e463dfb7595202d229f5f3ab4b852289a1cd09650362db23b9eb0b"}, + {file = "pillow-10.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:2034f6759a722da3a3dbd91a81148cf884e91d1b747992ca288ab88c1de15999"}, + {file = "pillow-10.3.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:2ed854e716a89b1afcedea551cd85f2eb2a807613752ab997b9974aaa0d56936"}, + {file = "pillow-10.3.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:dc1a390a82755a8c26c9964d457d4c9cbec5405896cba94cf51f36ea0d855002"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:4203efca580f0dd6f882ca211f923168548f7ba334c189e9eab1178ab840bf60"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3102045a10945173d38336f6e71a8dc71bcaeed55c3123ad4af82c52807b9375"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6fb1b30043271ec92dc65f6d9f0b7a830c210b8a96423074b15c7bc999975f57"}, + {file = "pillow-10.3.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:1dfc94946bc60ea375cc39cff0b8da6c7e5f8fcdc1d946beb8da5c216156ddd8"}, + {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b09b86b27a064c9624d0a6c54da01c1beaf5b6cadfa609cf63789b1d08a797b9"}, + {file = "pillow-10.3.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d3b2348a78bc939b4fed6552abfd2e7988e0f81443ef3911a4b8498ca084f6eb"}, + {file = "pillow-10.3.0-cp39-cp39-win32.whl", hash = "sha256:45ebc7b45406febf07fef35d856f0293a92e7417ae7933207e90bf9090b70572"}, + {file = "pillow-10.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:0ba26351b137ca4e0db0342d5d00d2e355eb29372c05afd544ebf47c0956ffeb"}, + {file = "pillow-10.3.0-cp39-cp39-win_arm64.whl", hash = "sha256:50fd3f6b26e3441ae07b7c979309638b72abc1a25da31a81a7fbd9495713ef4f"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:6b02471b72526ab8a18c39cb7967b72d194ec53c1fd0a70b050565a0f366d355"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8ab74c06ffdab957d7670c2a5a6e1a70181cd10b727cd788c4dd9005b6a8acd9"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:048eeade4c33fdf7e08da40ef402e748df113fd0b4584e32c4af74fe78baaeb2"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2ec1e921fd07c7cda7962bad283acc2f2a9ccc1b971ee4b216b75fad6f0463"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = 
"sha256:4c8e73e99da7db1b4cad7f8d682cf6abad7844da39834c288fbfa394a47bbced"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:16563993329b79513f59142a6b02055e10514c1a8e86dca8b48a893e33cf91e3"}, + {file = "pillow-10.3.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:dd78700f5788ae180b5ee8902c6aea5a5726bac7c364b202b4b3e3ba2d293170"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:aff76a55a8aa8364d25400a210a65ff59d0168e0b4285ba6bf2bd83cf675ba32"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:b7bc2176354defba3edc2b9a777744462da2f8e921fbaf61e52acb95bafa9828"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:793b4e24db2e8742ca6423d3fde8396db336698c55cd34b660663ee9e45ed37f"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d93480005693d247f8346bc8ee28c72a2191bdf1f6b5db469c096c0c867ac015"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c83341b89884e2b2e55886e8fbbf37c3fa5efd6c8907124aeb72f285ae5696e5"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1a1d1915db1a4fdb2754b9de292642a39a7fb28f1736699527bb649484fb966a"}, + {file = "pillow-10.3.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a0eaa93d054751ee9964afa21c06247779b90440ca41d184aeb5d410f20ff591"}, + {file = "pillow-10.3.0.tar.gz", hash = "sha256:9d2455fbf44c914840c793e89aa82d0e1763a14253a000743719ae5946814b2d"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] +typing = ["typing-extensions"] +xmp = ["defusedxml"] + [[package]] name 
= "platformdirs" version = "4.2.0" @@ -1514,4 +1600,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.11" -content-hash = "c7cada0d348ebdcb48a3468d0b45aa8509b57ab3cd4d3c4065421bb0c0f1f57b" +content-hash = "8b12b19145242fe86f09024453bca29792f6e22b4e63cfc72e2c6e480f38f043" diff --git a/pyproject.toml b/pyproject.toml index f819538..cc9cc4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,6 +16,7 @@ python-dateutil = "^2.9.0.post0" colorlog = "^6.8.2" feedparser = "^6.0.11" lxml = "^5.2.1" +pillow = "^10.3.0" [build-system] diff --git a/repub/entrypoint.py b/repub/entrypoint.py index 71205cb..7a6357d 100644 --- a/repub/entrypoint.py +++ b/repub/entrypoint.py @@ -1,30 +1,88 @@ -from scrapy.crawler import CrawlerProcess -from scrapy.utils.project import get_project_settings +import logging +import multiprocessing as mp +import multiprocessing.connection as mpc -from . import colorlog -from .postprocessing import SortRssItems -from .spiders.rss_spider import RssFeedSpider - -base_settings = get_project_settings() - -settings = { - **base_settings, - "FEEDS": { - "out/feed.rss": { - "format": "rss", - "postprocessing": [], - }, - }, +feeds = { + "gp-pod": {"url": "https://guardianproject.info/podcast/podcast.xml"}, + "nasa": {"url": "https://www.nasa.gov/rss/dyn/breaking_news.rss"}, } -colorlog.load_colorlog() +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +ch = logging.StreamHandler() +ch.setLevel(logging.DEBUG) +formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") +ch.setFormatter(formatter) +logger.addHandler(ch) -urls = ["https://www.nasa.gov/rss/dyn/breaking_news.rss"] +class FeedNameFilter: + def __init__(self, feed_options): + self.feed_options = feed_options + + def accepts(self, item): + return item.feed_name == self.feed_options["feed_name"] + + +def execute_spider(queue, name, url): + from scrapy.crawler import CrawlerProcess + 
from scrapy.settings import Settings + from scrapy.utils.project import get_project_settings + + from .spiders.rss_spider import RssFeedSpider + + try: + settings: Settings = { + **get_project_settings(), + "REPUBLISHER_OUT_DIR": "out", + "FEEDS": { + f"out/{name}.rss": { + "format": "rss", + "postprocessing": [], + # "item_filter": FeedNameFilter, + "feed_name": name, + } + }, + "ITEM_PIPELINES": { + "repub.pipelines.ImagePipeline": 1, + "repub.pipelines.AudioPipeline": 2, + "repub.pipelines.VideoPipeline": 3, + "repub.pipelines.FilePipeline": 4, + }, + "LOG_FILE": f"logs/{name}.log", + "REPUBLISHER_IMAGE_DIR": "images", + "REPUBLISHER_VIDEO_DIR": "video", + "REPUBLISHER_AUDIO_DIR": "audio", + "REPUBLISHER_FILE_DIR": "files", + "IMAGES_STORE": f"out/{name}/images", + "AUDIO_STORE": f"out/{name}/audio", + "VIDEO_STORE": f"out/{name}/video", # keep in sync with REPUBLISHER_VIDEO_DIR + "FILES_STORE": f"out/{name}/files", + } + process = CrawlerProcess(settings) + # colorlog.load_colorlog() + process.crawl(RssFeedSpider, feed_name=name, urls=[url]) + process.start() + queue.put(None) + except Exception as e: + queue.put(e) def entrypoint(): - process = CrawlerProcess(settings) - - process.crawl(RssFeedSpider, urls=urls) - process.start() # the script will block here until the crawling is finished + pool = [] + for name, data in feeds.items(): + logger.info(f"Starting feed {name}") + queue = mp.Queue() + process = mp.Process(target=execute_spider, args=(queue, name, data["url"])) + pool.append((name, process, queue)) + for n, proc, q in pool: + proc.start() + mpc.wait(p.sentinel for n, p, q in pool) + for name, p, q in pool: + result = q.get() + if result is not None: + print() + logger.error(f"Feed {name} encountered error") + logger.critical(result, exc_info=True) + else: + logger.info(f"Feed {name} completed successfully") diff --git a/repub/exporters.py b/repub/exporters.py index ee28b53..21d6fcb 100644 --- a/repub/exporters.py +++ b/repub/exporters.py @@ -28,7 +28,7 @@ class
RssExporter(BaseItemExporter): self.flush_buffer() return - if not self.channel: + if self.channel is None: self.item_buffer.append(item) else: self.export_rss_item(item) diff --git a/repub/items.py b/repub/items.py index 4cb36f2..748858e 100644 --- a/repub/items.py +++ b/repub/items.py @@ -1,12 +1,24 @@ from dataclasses import dataclass -from typing import Any +from typing import Any, List @dataclass class ElementItem: + feed_name: str el: Any + image_urls: List[str] + images: List[Any] + file_urls: List[str] + files: List[Any] + audio_urls: List[str] + audios: List[Any] + video_urls: List[str] + videos: List[Any] @dataclass class ChannelElementItem: + feed_name: str el: Any + image_urls: List[str] + images: List[Any] diff --git a/repub/pipelines.py b/repub/pipelines.py index 9b9d0f2..ccfa57e 100644 --- a/repub/pipelines.py +++ b/repub/pipelines.py @@ -1,83 +1,44 @@ -# Define your item pipelines here -# -# Don't forget to add your pipeline to the ITEM_PIPELINES setting -# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html +from os import PathLike +from pathlib import PurePosixPath +from typing import IO, DefaultDict, Optional, Set, Union +from urllib.parse import urlparse + +import repub.utils +from repub.exporters import RssExporter +from scrapy.pipelines.images import FilesPipeline as BaseFilesPipeline +from scrapy.pipelines.images import ImagesPipeline as BaseImagesPipeline -# useful for handling different item types with a single interface -# from itemadapter import ItemAdapter -import six -from scrapy import signals -from scrapy.exceptions import CloseSpider, NotConfigured -from scrapy.utils.misc import load_object +class ImagePipeline(BaseImagesPipeline): + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.local_image_path(request.url) -from .exporters import RssItemExporter -from .items import RssItem -from .signals import feed_channel_discovered + def thumb_path(self, request, thumb_id, response=None, 
info=None, *, item=None): + raise NotImplementedError() -class RssExportPipeline(object): - def __init__(self): - self.files = {} - self.exporters = {} - - @classmethod - def from_crawler(cls, crawler): - pipeline = cls() - crawler.signals.connect(pipeline.spider_closed, signals.spider_closed) - crawler.signals.connect( - pipeline.feed_channel_discovered, feed_channel_discovered - ) - return pipeline - - def feed_channel_discovered(self, spider, feed, channel): - try: - file = open(spider.settings.get("FEED_FILE"), "wb") - except TypeError: - raise NotConfigured("FEED_FILE parameter does not string or does not exist") - except (IOError, OSError) as e: - raise CloseSpider( - "Cannot open file {}: {}".format( - spider.settings.get("FEED_FILE", None), e - ) - ) - self.files[spider] = file - - item_cls = spider.settings.get( - "FEED_ITEM_CLASS", spider.settings.get("FEED_ITEM_CLS", RssItem) - ) - if isinstance(item_cls, six.string_types): - item_cls = load_object(item_cls) - - namespaces = spider.settings.get("FEED_NAMESPACES", {}) - - feed_exporter = spider.settings.get("FEED_EXPORTER", RssItemExporter) - if isinstance(feed_exporter, six.string_types): - feed_exporter = load_object(feed_exporter) - if not issubclass(feed_exporter, RssItemExporter): - raise TypeError( - "FEED_EXPORTER must be RssItemExporter or its subclass, not '{}'".format( - feed_exporter - ) - ) - self.exporters[spider] = feed_exporter( - file, - channel, - namespaces=namespaces, - item_cls=item_cls, - ) - self.exporters[spider].start_exporting() - - def spider_closed(self, spider): - self.exporters[spider].finish_exporting() - file = self.files.pop(spider) - file.close() - - def process_item(self, item, spider): - self.exporters[spider].export_item(item) - return item +class FilePipeline(BaseFilesPipeline): + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.local_file_path(request.url) -class RepubPipeline: - def process_item(self, item, spider): - return 
item +class AudioPipeline(BaseFilesPipeline): + def __init__(self, store_uri: Union[str, PathLike], **kwargs): + self.FILES_URLS_FIELD = "audio_urls" + self.FILES_RESULT_FIELD = "audios" + store_uri = kwargs["settings"]["AUDIO_STORE"] + super().__init__(store_uri, **kwargs) + + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.local_audio_path(request.url) + + +class VideoPipeline(BaseFilesPipeline): + def __init__(self, store_uri: Union[str, PathLike], **kwargs): + self.FILES_URLS_FIELD = "video_urls" + self.FILES_RESULT_FIELD = "videos" + store_uri = kwargs["settings"]["VIDEO_STORE"] + super().__init__(store_uri, **kwargs) + + def file_path(self, request, response=None, info=None, *, item=None): + return repub.utils.local_video_path(request.url) diff --git a/repub/postprocessing.py b/repub/postprocessing.py index 8b8b3c1..e69de29 100644 --- a/repub/postprocessing.py +++ b/repub/postprocessing.py @@ -1,11 +0,0 @@ -class SortRssItems: - def __init__(self, file, feed_options): - self.file = file - self.feed_options = feed_options - self.buffer = "" - - def write(self, data): - self.buffer += data.decode("utf-8") - - def close(self): - self.file.write(sorted) diff --git a/repub/rss.py b/repub/rss.py index 0e0af5c..0a6ab18 100644 --- a/repub/rss.py +++ b/repub/rss.py @@ -78,7 +78,7 @@ def sort_rss(root): def serialize(root): - root = sort_rss(root) + # root = sort_rss(root) return etree.tostring( root, encoding="utf-8", xml_declaration=True, pretty_print=True ) diff --git a/repub/settings.py b/repub/settings.py index b6e3f5e..076ed7e 100644 --- a/repub/settings.py +++ b/repub/settings.py @@ -93,4 +93,9 @@ FEED_EXPORTERS = { "rss": "repub.exporters.RssExporter", } -LOG_LEVEL = "ERROR" +TELNETCONSOLE_ENABLED = False + +LOG_LEVEL = "INFO" +# LOG_LEVEL = "ERROR" + +MEDIA_ALLOW_REDIRECTS = True diff --git a/repub/spiders/rss_spider.py b/repub/spiders/rss_spider.py index 00ffd83..ab4b7b9 100644 --- a/repub/spiders/rss_spider.py +++ 
b/repub/spiders/rss_spider.py @@ -3,6 +3,8 @@ import logging import feedparser from repub.items import ChannelElementItem, ElementItem from repub.rss import CDATA, CONTENT, ITUNES, MEDIA, E, normalize_date +from repub.utils import FileType, determine_file_type, local_file_path, local_image_path +from scrapy.crawler import Crawler from scrapy.spiders import Spider from scrapy.utils.spider import iterate_spider_output @@ -13,6 +15,34 @@ class BaseRssFeedSpider(Spider): from RSS feeds. """ + def __init__(self, feed_name, **kwargs): + super().__init__(**kwargs) + self.feed_name = feed_name + + def _set_crawler(self, crawler: Crawler) -> None: + super()._set_crawler(crawler) + for s in [ + "REPUBLISHER_IMAGE_DIR", + "REPUBLISHER_FILE_DIR", + "REPUBLISHER_AUDIO_DIR", + "REPUBLISHER_VIDEO_DIR", + ]: + if self.settings.get(s) is None: + raise RuntimeError(f"Missing setting: {s}") + + def rewrite_file_url(self, file_type: FileType, url): + file_dir = self.settings["REPUBLISHER_FILE_DIR"] + if file_type == FileType.IMAGE: + file_dir = self.settings["REPUBLISHER_IMAGE_DIR"] + elif file_type == FileType.VIDEO: + file_dir = self.settings["REPUBLISHER_VIDEO_DIR"] + elif file_type == FileType.AUDIO: + file_dir = self.settings["REPUBLISHER_AUDIO_DIR"] + return f"/{file_dir}/{local_file_path(url)}" + + def rewrite_image_url(self, url): + return self.rewrite_file_url(FileType.IMAGE, url) + def parse_feed(self, feed_text): parsed = feedparser.parse(feed_text, sanitize_html=False) if parsed.bozo: @@ -48,25 +78,30 @@ class BaseRssFeedSpider(Spider): for tag in f.get("tags", []): channel.append(E.category(tag.term)) + image_urls = [] if "image" in f: if "href" in f.image: image = E.image( E.title(f.get("title")), E.link(f.get("link")), - E.url(f.image.get("href")), + E.url(self.rewrite_image_url(f.image.get("href"))), E.description(f.get("description")), ) + image_urls.append(f.image.get("href")) else: image = E.image( E.title(f.image.get("title")), E.link(f.image.get("link")), - 
E.url(f.image.get("url")), + E.url(self.rewrite_image_url(f.image.get("url"))), E.description(f.image.get("description")), E.width(f.image.get("width")), E.height(f.image.get("height")), ) + image_urls.append(f.image.get("url")) channel.append(image) - return ChannelElementItem(el=channel) + return ChannelElementItem( + feed_name=self.feed_name, el=channel, image_urls=image_urls, images=[] + ) def _parse(self, response, **kwargs): response = self.adapt_response(response) @@ -113,6 +148,21 @@ class RssFeedSpider(BaseRssFeedSpider): super().__init__(**kwargs) def parse_entry(self, response, feed, entry): + image_urls = [] + file_urls = [] + audio_urls = [] + video_urls = [] + + def add_url(file_type, url): + if file_type == FileType.IMAGE: + image_urls.append(url) + elif file_type == FileType.AUDIO: + audio_urls.append(url) + elif file_type == FileType.VIDEO: + video_urls.append(url) + elif file_type == FileType.FILE: + file_urls.append(url) + item = E.item( E.title(entry.get("title")), E.link(entry.get("link")), @@ -125,15 +175,29 @@ class RssFeedSpider(BaseRssFeedSpider): E.author(entry.get("author")), ITUNES.summary(entry.get("summary")), ITUNES.duration(entry.get("itunes_duration")), + ITUNES.image( + None, + ( + {"href": self.rewrite_image_url(entry.get("image").href)} + if "image" in entry + else None + ), + ), ) + if entry.get("image"): + image_urls.append(entry.get("image").href) for enc in entry.enclosures: + file_type = determine_file_type( + url=enc.get("href"), mimetype=enc.get("type") + ) item.append( E.enclosure( - E.url(enc.get("href")), + E.url(self.rewrite_file_url(file_type, enc.get("href"))), E.length(enc.get("length")), E.type(enc.get("type")), ) ) + add_url(file_type, enc.get("href")) if "content" in entry: for c in entry.content: @@ -144,9 +208,14 @@ class RssFeedSpider(BaseRssFeedSpider): for media in ( media for media in entry["media_content"] if media.get("url") ): + file_type = determine_file_type( + url=media.get("url"), + 
medium=media.get("medium"), + mimetype=media.get("type"), + ) item.append( MEDIA.content( - E.url(media.get("url")), + E.url(self.rewrite_file_url(file_type, media.get("url"))), E.type(media.get("type")), E.medium(media.get("medium")), E.isDefault(media.get("isDefault")), @@ -161,4 +230,16 @@ class RssFeedSpider(BaseRssFeedSpider): E.lang(media.get("lang")), ) ) - return ElementItem(el=item) + add_url(file_type, media.get("url")) + return ElementItem( + feed_name=self.feed_name, + el=item, + images=[], + image_urls=image_urls, + files=[], + file_urls=file_urls, + audio_urls=audio_urls, + audios=[], + video_urls=video_urls, + videos=[], + ) diff --git a/repub/utils.py b/repub/utils.py new file mode 100644 index 0000000..6c827c7 --- /dev/null +++ b/repub/utils.py @@ -0,0 +1,74 @@ +import hashlib +import mimetypes +from enum import Enum +from pathlib import Path +from typing import Any, List, Optional + +from scrapy.utils.python import to_bytes + + +class FileType(Enum): + """File types that the republisher can handle""" + + VIDEO = "video" + IMAGE = "image" + AUDIO = "audio" + FILE = "file" + + +def local_image_path(name: str) -> str: + image_guid = hashlib.sha1(to_bytes(name)).hexdigest() # nosec + return f"full/{image_guid}.jpg" + + +def local_file_path(s: str) -> str: + media_guid = hashlib.sha1(to_bytes(s)).hexdigest() # nosec + media_ext = Path(s).suffix + # Handles empty and wild extensions by trying to guess the + # mime type then extension or default to empty string otherwise + if media_ext not in mimetypes.types_map: + media_ext = "" + media_type = mimetypes.guess_type(s)[0] + if media_type: + media_ext = mimetypes.guess_extension(media_type) or "" # guess_extension() may return None + return f"{media_guid}{media_ext}" + + +def local_video_path(s: str) -> str: + return local_file_path(s) + + +def local_audio_path(s: str) -> str: + return local_file_path(s) + + +def determine_file_type( + url: str, medium: Optional[str] = None, mimetype: Optional[str] = None +): + """ + Uses all available information
to determine the type of a file from a path/url + """ + if medium: + if medium == "video": + return FileType.VIDEO + if medium == "audio": + return FileType.AUDIO + if medium == "image": + return FileType.IMAGE + if medium == "document": + return FileType.FILE + if medium == "executable": + return FileType.FILE + + if not mimetype: + mimetype = mimetypes.guess_type(url)[0] + + if mimetype: + if mimetype.startswith("image"): + return FileType.IMAGE + if mimetype.startswith("audio"): + return FileType.AUDIO + if mimetype.startswith("video"): + return FileType.VIDEO + + return FileType.FILE