Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

#!/usr/local/bin/python 

# encoding: utf-8 

""" 

*Print webpages to PDF* 

 

:Author: 

David Young 

 

:Date Created: 

September 28, 2015 

""" 

################# GLOBAL IMPORTS #################### 

import sys 

import os 

os.environ['TERM'] = 'vt100' 

import readline 

import glob 

import pickle 

import re 

import codecs 

from subprocess import Popen, PIPE, STDOUT 

from docopt import docopt 

from fundamentals import tools, times 

import codecs 

from fundamentals.files.tag import tag 

# SET ENCODE ERROR RETURN VALUE 

 

 

def handler(e):
    """
    *Codec error callback that swaps an un-codable character for a blank*

    **Key Arguments:**
        - ``e`` -- the unicode error instance passed in by the codec machinery (only its ``start`` attribute is read)

    **Return:**
        - a ``(replacement, resume_position)`` tuple -- the replacement is a single space and processing resumes one position after the start of the failing range
    """
    replacement = u' '
    resume_position = e.start + 1
    return (replacement, resume_position)


# MAKE THE CALLBACK AVAILABLE AS THE 'dryx' ERROR-HANDLING SCHEME
codecs.register_error('dryx', handler)

 

################################################################### 

# CLASSES # 

################################################################### 

 

 

class printpdf():

    """
    *PDF printer*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary (``settings["executables"]["electron path"]`` is read to find the PDF converter)
        - ``url`` -- the webpage url
        - ``title`` -- title of pdf
        - ``folderpath`` -- path at which to save pdf
        - ``append`` -- append this at the end of the file name (not title)
        - ``readability`` -- clean text with Mercury Parser

    **Usage:**

        To print a webpage to PDF without any cleaning of the content using the title of the webpage as filename:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                readability=False
            ).get()

        To give the PDF an alternative title use:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                title="Cars",
                readability=False
            ).get()

        Or to append a string to the end of the filename before *.pdf* extension (useful for indexing or adding date created etc):

        .. code-block:: python

            from datetime import datetime, date, time
            now = datetime.now()
            now = now.strftime("%Y%m%dt%H%M%S")

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                append="_"+now,
                readability=False
            ).get()

        To clean the content using the Mercury Parser and apply some simple styling and pretty fonts:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath=pathToOutputDir,
                readability=True
            ).get()
    """
    # Initialisation

    def __init__(
            self,
            log,
            settings=False,
            url=False,
            title=False,
            folderpath=False,
            append=False,
            readability=True
    ):
        self.log = log
        log.debug("instansiating a new 'print' object")
        self.settings = settings
        self.url = url
        self.folderpath = folderpath
        self.title = title
        self.append = append
        self.readability = readability
        # xt-self-arg-tmpx

        # INITIAL ACTIONS
        return None

    def get(self):
        """
        *get the PDF*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF, or ``None`` if the webpage could not be cleaned and so no PDF was produced
        """
        self.log.info('starting the ``get`` method')

        # APPEND TO FILENAME?
        if not self.append:
            self.append = ""

        if not self.readability:
            pdfPath = self._print_original_webpage()
        else:
            pdfPath = self._print_parsed_webpage()

        # THE PRINT METHODS RETURN NOTHING WHEN THE CLEANER YIELDS NO HTML
        # FILE - THERE IS THEN NO FILE TO TAG
        if not pdfPath:
            self.log.warning(
                'no PDF was generated for %s - skipping tagging' % (self.url,))
            self.log.info('completed the ``get`` method')
            return None

        # NOTE(review): the tag value below is hard-coded and looks like a
        # debugging left-over - confirm the intended tag(s) before changing
        tag(
            log=self.log,
            filepath=pdfPath,
            tags="shit",
            rating=4,
            wherefrom=self.url
        )

        self.log.info('completed the ``get`` method')
        return pdfPath

    def _print_original_webpage(
            self):
        """*print the original webpage*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF, or ``None`` if a title was needed but the cleaner produced no HTML file to take it from
        """
        self.log.info('starting the ``_print_original_webpage`` method')

        if not self.title:
            # NO TITLE GIVEN - RUN THE CLEANER ONLY TO LEARN THE FILENAME IT
            # CHOOSES, THEN DISCARD THE HTML IT WROTE
            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=self.log,
                settings=self.settings,
                url=self.url,
                outputDirectory=self.folderpath,
                title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
                style=True,  # add polyglot's styling to the HTML document
                # include metadata in generated HTML (e.g. title),
                metadata=True,
                h1=True  # include title as H1 at the top of the doc
            )
            htmlFile = cleaner.clean()
            if not htmlFile:
                self.log.warning(
                    'could not determine a title for %s' % (self.url,))
                return None
            title = self._strip_html_extension(os.path.basename(htmlFile))
            os.remove(htmlFile)
        else:
            title = self.title

        # ``append`` MAY STILL BE ``False`` IF THIS METHOD IS CALLED DIRECTLY
        append = self.append or ""
        pdfPath = self.folderpath + "/" + title + append + ".pdf"

        # CONVERT TO PDF WITH ELECTRON PDF
        self._electron_pdf(
            source=self.url, pdfPath=pdfPath, printBackground=True)
        self._exit_if_missing(pdfPath)

        self.log.info('completed the ``_print_original_webpage`` method')
        return pdfPath

    def _print_parsed_webpage(
            self):
        """*print the parsed/cleaned webpage*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF, or ``None`` if the cleaner produced no HTML file
        """
        self.log.info('starting the ``_print_parsed_webpage()`` method')

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.url,
            outputDirectory=self.folderpath,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=True,  # add polyglot's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=True  # include title as H1 at the top of the doc
        )
        htmlFile = cleaner.clean()
        if not htmlFile:
            return None

        # SWAP ONLY THE TRAILING EXTENSION - A '.html' ELSEWHERE IN THE PATH
        # MUST BE LEFT ALONE
        append = self.append or ""
        pdfPath = self._strip_html_extension(htmlFile) + append + ".pdf"

        # CONVERT TO PDF WITH ELECTRON PDF
        self._electron_pdf(source=htmlFile, pdfPath=pdfPath)

        # REMOVE HTML FILE
        os.remove(htmlFile)

        self._exit_if_missing(pdfPath)

        self.log.info('completed the ``_print_parsed_webpage()`` method')
        return pdfPath

    def _strip_html_extension(
            self,
            path):
        """*return ``path`` without a trailing ``.html`` extension (unchanged if it has a different extension)*"""
        root, extension = os.path.splitext(path)
        if extension == ".html":
            return root
        return path

    def _electron_pdf(
            self,
            source,
            pdfPath,
            printBackground=False):
        """*run the configured electron executable to convert ``source`` (a URL or HTML file path) into a PDF at ``pdfPath``*

        **Key Arguments:**
            - ``source`` -- URL or path of the HTML to convert
            - ``pdfPath`` -- path to write the PDF to
            - ``printBackground`` -- pass ``--printBackground`` to the converter
        """
        try:
            from shlex import quote
        except ImportError:
            # PYTHON 2 KEEPS THE SAME HELPER IN ``pipes``
            from pipes import quote

        # THE EXECUTABLE SETTING IS LEFT UNQUOTED SO THE SHELL TREATS IT
        # EXACTLY AS BEFORE; ONLY THE URL AND PATHS ARE QUOTED SO CHARACTERS
        # SUCH AS `$`, `"` OR BACKTICKS ARE PASSED THROUGH LITERALLY
        electron = self.settings["executables"]["electron path"]
        cmd = "%s -i %s -o %s" % (electron, quote(source), quote(pdfPath))
        if printBackground:
            cmd += " --printBackground"

        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())
        if len(stderr):
            print(stderr)

    def _exit_if_missing(
            self,
            pdfPath):
        """*report the problem and exit with a failure status if ``pdfPath`` does not exist on disk*"""
        if not os.path.exists(pdfPath):
            print("%(pdfPath)s was not generated for some reason - please investigate" % locals())
            # NON-ZERO STATUS SO CALLING SCRIPTS CAN DETECT THE FAILURE
            sys.exit(1)

 

 

# NOTE(review): no ``main`` function is defined or imported anywhere in the
# visible source, so running this module as a script would presumably raise a
# ``NameError`` on the call below -- confirm whether an entry point was intended.
if __name__ == '__main__': 

main()