Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

#!/usr/local/bin/python 

# encoding: utf-8 

""" 

*using the Mercury Parser API to clean up a local html file* 

 

:Author: 

David Young 

 

:Date Created: 

October 1, 2016 

""" 

################# GLOBAL IMPORTS #################### 

import sys 

import os 

import codecs 

import re 

os.environ['TERM'] = 'vt100' 

from fundamentals import tools 

import requests 

 

 

class htmlCleaner(object):
    """
    *A parser/cleaner to strip a webpage article of all cruft and neatly present it with some nice css*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary (must contain the ``mercury api key`` entry)
        - ``url`` -- the URL to the HTML page to parse and clean
        - ``outputDirectory`` -- path to the directory to save the output html file to. If *False* the current working directory is used. Default *False*.
        - ``title`` -- title of the document to save. If *False* will take the title of the HTML page as the filename. Default *False*.
        - ``style`` -- add polyglot's styling to the HTML document. Default *True*
        - ``metadata`` -- include metadata in generated HTML. Default *True*
        - ``h1`` -- include title as H1 at the top of the doc. Default *True*

    **Usage:**

        To generate the HTML page, using the title of the webpage as the filename:

        .. code-block:: python

            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=log,
                settings=settings,
                url="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                outputDirectory="/tmp"
            )
            cleaner.clean()

        Or specify the title of the document and remove styling, metadata and title:

        .. code-block:: python

            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=log,
                settings=settings,
                url="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                outputDirectory="/tmp",
                title="my_clean_doc",
                style=False,
                metadata=False,
                h1=False
            )
            cleaner.clean()
    """

    # Seconds to wait on the Mercury API before treating the request as
    # failed; without a timeout ``requests`` can block indefinitely.
    _REQUEST_TIMEOUT = 60

    # (pattern, replacement) pairs applied in order to the parsed article
    # HTML so that it renders correctly once detached from its original site.
    _HTML_FIXES = (
        # wikipedia "[edit]" section links
        (re.compile(
            r'<span class="mw-editsection"><span class="mw-editsection-bracket">'
            r'.*"mw-editsection-bracket">\]'), u""),
        # wikipedia "[better source needed]" markers
        (re.compile(
            r'<sup class="noprint.*better source needed</span></a></i>\]</sup>',
            re.I), u""),
        # make wikipedia table-of-contents links point at the local anchors
        (re.compile(
            r'<a href="https://en\.wikipedia\.org/wiki/.*(#.*)"><span class="tocnumber">',
            re.I), r'<a href="\g<1>"><span class="tocnumber">'),
        # drop only the ``srcset`` attribute; the tag's closing bracket and
        # any attributes that follow must survive or the markup is broken
        (re.compile(r'\s*srcset="[^"]*"'), u""),
    )

    # Characters stripped from a page title before it is used as a filename
    _TITLE_STRIP_CHARS = """:/"&\\'`"""

    _PAGE_TEMPLATE = u"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
%(metadata)s 

<style>
%(thisCss)s
</style>

</head>
<body>

%(h1)s 
<a href="%(url)s">original source</a>
</br></br>


%(text)s 
</body>
</html>"""

    def __init__(
            self,
            log,
            settings,
            url,
            outputDirectory=False,
            title=False,
            style=True,
            metadata=True,
            h1=True
    ):
        self.log = log
        log.debug("instansiating a new 'htmlCleaner' object")
        self.settings = settings
        self.url = url
        self.outputDirectory = outputDirectory
        self.title = title
        self.style = style
        self.metadata = metadata
        self.h1 = h1

    def clean(
            self):
        """*parse and clean the html document with Mercury Parser*

        **Return:**
            - ``filePath`` -- path to the cleaned HTML document, or *None* if
              the page could not be fetched/parsed (request failure, 503 from
              the parser, non-JSON or empty reply, error reply, or no
              ``content`` in the reply)

        **Usage:**

            See class usage
        """
        self.log.info('starting the ``clean`` method')

        url = self.url

        # PARSE THE CONTENT OF THE WEBPAGE AT THE URL
        parser_response = self._request_parsed_article_from_mercury(url)
        if parser_response is None or parser_response.status_code == 503:
            return None
        try:
            article = parser_response.json()
        except ValueError:
            # body was not JSON (e.g. an HTML error page from a gateway)
            self.log.warning(
                'the parser did not return JSON for %s' % (url,))
            return None

        if not article:
            return None

        # CATCH ERRORS
        if "error" in article and article["error"]:
            print(url)
            print(" %s" % (article.get("messages"),))
            return None
        try:
            text = article["content"]
        except (KeyError, TypeError):
            text = None
        if text is None:
            print("Can't decode the text of %(url)s - moving on" %
                  {"url": url})
            return None

        # COMMON FIXES TO HTML TO RENDER CORRECTLY
        for pattern, replacement in self._HTML_FIXES:
            text = pattern.sub(replacement, text)

        # GRAB HTML TITLE IF NOT SET IN ARGUMENTS
        if not self.title:
            self.title = self._title_from_article(article)

        title = self.title.replace(".html", "")
        pageTitle = title.replace("_", " ")

        # GRAB THE CSS USED TO STYLE THE WEBPAGE/PDF CONTENT
        thisCss = self._read_css() if self.style else ""

        metadata = ""
        if self.metadata:
            metadata = "<title>%(title)s</title>" % {"title": title}

        h1 = ""
        if self.h1:
            h1 = "<h1>%(pageTitle)s</h1>" % {"pageTitle": pageTitle}

        # REGENERATE THE HTML DOCUMENT WITH CUSTOM STYLE
        content = self._PAGE_TEMPLATE % {
            "metadata": metadata,
            "thisCss": thisCss,
            "h1": h1,
            "url": url,
            "text": text,
        }

        # no directory given: write next to wherever the caller is running
        outputDirectory = self.outputDirectory or os.getcwd()
        filePath = os.path.join(outputDirectory, title + ".html")
        with codecs.open(filePath, encoding='utf-8', mode='w') as writeFile:
            writeFile.write(content)

        self.log.info('completed the ``clean`` method')
        return filePath

    def _read_css(
            self):
        """*read the stylesheet shipped alongside this module*

        **Return:**
            - ``thisCss`` -- content of ``css/main.css`` found in this module's directory
        """
        cssFile = os.path.join(
            os.path.dirname(__file__), "css", "main.css")
        with codecs.open(cssFile, encoding='utf-8', mode='r') as readFile:
            return readFile.read()

    def _title_from_article(
            self,
            article):
        """*derive a filename-safe title from the parsed article*

        **Key Arguments:**
            - ``article`` -- the dictionary decoded from the parser's JSON reply

        **Return:**
            - ``title`` -- the article title reduced to ASCII with the characters in ``_TITLE_STRIP_CHARS`` removed, or a timestamp if nothing is left
        """
        rawTitle = article.get("title") or u""
        # drop non-ASCII characters but stay a text string so the result
        # can be used with ``replace`` and path joins
        title = rawTitle.encode("ascii", "ignore").decode("ascii")
        for char in self._TITLE_STRIP_CHARS:
            title = title.replace(char, "")

        # USE DATETIME IF TITLE STILL NOT SET
        if not title:
            from datetime import datetime
            title = datetime.now().strftime("%Y%m%dt%H%M%S")
        return title

    def _request_parsed_article_from_mercury(
            self,
            url):
        """*request parsed article from mercury*

        **Key Arguments:**
            - ``url`` -- the URL to the HTML page to parse and clean

        **Return:**
            - ``response`` -- the ``requests`` response from the parser API, or *None* if the HTTP request failed
        """
        self.log.info(
            'starting the ``_request_parsed_article_from_mercury`` method')

        # stays None if the request raises, so the return below is always bound
        response = None
        try:
            response = requests.get(
                url="https://mercury.postlight.com/parser",
                params={
                    "url": url,
                },
                headers={
                    "x-api-key": self.settings["mercury api key"],
                },
                timeout=self._REQUEST_TIMEOUT,
            )
        except requests.exceptions.RequestException:
            print('HTTP Request failed')

        self.log.info(
            'completed the ``_request_parsed_article_from_mercury`` method')
        return response

 

# use the tab-trigger below for new method 

# xt-class-method