Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
#!/usr/local/bin/python # encoding: utf-8 *Print webpages to PDF*
:Author: David Young
:Date Created: September 28, 2015 """ ################# GLOBAL IMPORTS #################### # SET ENCODE ERROR RETURN VALUE
return (u' ', e.start + 1)
################################################################### # CLASSES # ###################################################################
""" *PDF printer*
**Key Arguments:** - ``log`` -- logger - ``settings`` -- the settings dictionary - ``url`` -- the webpage url - ``title`` -- title of pdf - ``folderpath`` -- path at which to save pdf - ``append`` -- append this at the end of the file name (not title) - ``readability`` -- clean text with Mercury Parser
**Usage:**
To print a webpage to PDF without any cleaning of the content using the title of the webpage as filename:
.. code-block:: python
from polyglot import printpdf pdf = printpdf( log=log, settings=settings, url="https://en.wikipedia.org/wiki/Volkswagen", folderpath="/path/to/output", readability=False ).get()
To give the PDF an alternative title use:
.. code-block:: python
from polyglot import printpdf pdf = printpdf( log=log, settings=settings, url="https://en.wikipedia.org/wiki/Volkswagen", folderpath="/path/to/output", title="Cars", readability=False ).get()
Or to append a string to the end of the filename, before the *.pdf* extension (useful for indexing, adding the date created, etc.):
.. code-block:: python
from datetime import datetime, date, time now = datetime.now() now = now.strftime("%Y%m%dt%H%M%S")
from polyglot import printpdf pdf = printpdf( log=log, settings=settings, url="https://en.wikipedia.org/wiki/Volkswagen", folderpath="/path/to/output", append="_"+now, readability=False ).get()
To clean the content using the Mercury Parser and apply some simple styling and pretty fonts:
.. code-block:: python
from polyglot import printpdf pdf = printpdf( log=log, settings=settings, url="https://en.wikipedia.org/wiki/Volkswagen", folderpath=pathToOutputDir, readability=True ).get()
""" # Initialisation
self, log, settings=False, url=False, title=False, folderpath=False, append=False, readability=True ): # xt-self-arg-tmpx
# INITIAL ACTIONS
""" *get the PDF*
**Return:** - ``pdfPath`` -- the path to the generated PDF """
# APPEND TO FILENAME?
pdfPath = self._print_original_webpage() else:
log=self.log, filepath=pdfPath, tags="shit", rating=4, wherefrom=self.url )
def _print_original_webpage(
        self):
    """*print the original webpage*

    Renders ``self.url`` to a PDF inside ``self.folderpath`` by running the
    executable configured at ``settings["executables"]["electron path"]``.
    When no title was supplied, the page is first passed through polyglot's
    ``htmlCleaner`` only to obtain a title from the generated file name; the
    intermediate HTML file is removed again immediately.

    **Return:**
        - ``pdfPath`` -- the path to the generated PDF

    **Raises:**
        - ``SystemExit`` -- (status 1) if nothing exists at ``pdfPath`` once the command has finished
    """
    self.log.info('starting the ``_print_original_webpage`` method')

    if not self.title:
        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.url,
            outputDirectory=self.folderpath,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=True,  # add polyglot's styling to the HTML document
            # include metadata in generated HTML (e.g. title),
            metadata=True,
            h1=True  # include title as H1 at the top of the doc
        )
        htmlFile = cleaner.clean()
        basename = os.path.basename(htmlFile)
        # Strip only a trailing ".html"; str.replace would also remove the
        # same text if it happened to occur in the middle of the title.
        if basename.endswith(".html"):
            title = basename[:-len(".html")]
        else:
            title = basename
        os.remove(htmlFile)
    else:
        title = self.title

    # CONVERT TO PDF WITH ELECTRON PDF
    url = self.url
    # The constructor signature defaults ``append`` to False, so treat any
    # falsy value as "no suffix" instead of failing on str + bool.
    suffix = self.append or ""
    pdfPath = self.folderpath + "/" + title + suffix + ".pdf"
    electron = self.settings["executables"]["electron path"]

    # Build an argument list and run it without a shell: the URL and the
    # (webpage-derived) title are untrusted text and must never be parsed by
    # /bin/sh. The configured executable is still word-split, and ``~`` /
    # ``$VAR`` are still expanded, to stay compatible with settings that
    # relied on the shell for that.
    import shlex
    cmd = [os.path.expandvars(os.path.expanduser(token))
           for token in shlex.split(electron)]
    cmd += ["-i", url, "-o", pdfPath, "--printBackground"]

    try:
        p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    except OSError as e:
        # Without a shell a missing or non-executable binary raises here
        # rather than writing "command not found" to stderr; report it the
        # same way and let the existence check below decide the outcome.
        stdout, stderr = "", str(e)
    else:
        stdout, stderr = p.communicate()
    self.log.debug('output: %s', stdout)
    if stderr:
        print(stderr)

    if not os.path.exists(pdfPath):
        print("%s was not generated for some reason - please investigate" % (pdfPath,))
        # A missing PDF is a failure, so exit with a non-zero status.
        sys.exit(1)

    self.log.info('completed the ``_print_original_webpage`` method')
    return pdfPath
self): """*print the parsed/cleaned webpage*
**Return:** - ``pdfPath`` -- the path to the generated PDF """
log=self.log, settings=self.settings, url=self.url, outputDirectory=self.folderpath, title=self.title, # SET TO FALSE TO USE WEBPAGE TITLE, style=True, # add polyglot's styling to the HTML document metadata=True, # include metadata in generated HTML (e.g. title), h1=True # include title as H1 at the top of the doc ) return
# CONVERT TO PDF WITH ELECTRON PDF
# REMOVE HTML FILE
print "%(pdfPath)s was not generated for some reason - please investigate" % locals() sys.exit(0)
main() |