1
|
from html.parser import HTMLParser
|
2
|
import re
|
3
|
|
4
|
class LinksParser(HTMLParser):
    """Collect the text found inside ``<div class="comment-content">`` blocks.

    ``recording`` is a nesting counter: 0 while outside a comment block,
    otherwise the number of ``<div>`` elements currently open, counting from
    the comment block itself. ``data`` accumulates every text chunk seen
    while the counter is non-zero.
    """

    def __init__(self):
        super().__init__()
        self.recording = 0
        self.data = []

    def handle_starttag(self, tag, attributes):
        """Start recording on a comment div, or go one nesting level deeper."""
        if tag != 'div':
            return

        if self.recording:
            # Already inside a comment block: count the nested div so that
            # its closing tag does not end the recording too early.
            self.recording += 1
            return

        # The class attribute must be exactly 'comment-content'.
        is_comment_block = any(
            name == 'class' and value == 'comment-content'
            for name, value in attributes
        )
        if is_comment_block:
            self.recording = 1

    def handle_endtag(self, tag):
        """Leave one div nesting level while a comment block is open."""
        if tag != 'div' or not self.recording:
            return
        self.recording -= 1

    def handle_data(self, data):
        """Keep the text chunk when it lies inside a comment block."""
        if self.recording:
            self.data.append(data)
|
30
|
|
31
|
|
32
|
# Optional class/id attributes tolerated on the tags rewritten below.
_TAG_ATTRS = r'( class="[a-z -]*")?( id="[a-z]*")?'

# Ordered (pattern, replacement) rules applied to every input line.
# The order matters: e.g. the opening <li> must be dropped before the
# closing </li> rule turns the rest of the line into a "# item" entry.
_LINE_RULES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r'<p' + _TAG_ATTRS + r'>', r''),
        (r'</p>', r''),
        (r'<h([1-9])' + _TAG_ATTRS + r'( rel="nofollow")?>\n?', r'\nh\1. '),
        (r'</h[1-9]>', r'\n'),
        (r'^[ \t]+', r''),
        (r'<br />', r'\n'),
        (r'<li' + _TAG_ATTRS + r'>', r''),
        (r'(.*)</li>', r'# \1'),
        (r'<ol' + _TAG_ATTRS + r'>', r''),
        (r'</ol>', r''),
        (r'<ul' + _TAG_ATTRS + r'>', r''),
        (r'</ul>', r''),
        # <pre>/<code> become placeholders so the HTML parser does not
        # consume them; parse_comment restores them afterwards.
        (r'<pre' + _TAG_ATTRS + r'>', r'balise_pre'),
        (r'</pre>', r'/balise_pre'),
        (r'<code' + _TAG_ATTRS + r'>', r'balise_code'),
        (r'</code>', r'/balise_code'),
        (r'<em' + _TAG_ATTRS + r'>', r'_'),
        (r'</em>', r'_'),
        (r'<b' + _TAG_ATTRS + r'>', r'*'),
        (r'</b>', r'*'),
        (r'<strong' + _TAG_ATTRS + r'>', r'*'),
        (r'</strong>', r'*'),
        # \b keeps <abbr>, <article>, <address>, ... intact.
        (r'<a\b', r''),
        # Non-greedy: stop at the closing quote of this href instead of
        # the last double quote on the line.
        (r'href="(.*?)"', r'\1'),
        (r'</a>', r''),
    )
)

# Placeholder -> tag pairs restored after parsing. The closing forms come
# first because each one contains its opening form as a substring.
_PLACEHOLDER_TAGS = (
    ('/balise_pre', '</pre>'),
    ('balise_pre', '<pre>'),
    ('/balise_code', '</code> '),
    ('balise_code', '<code> '),
)


def _convert_line(line):
    """Apply every rewrite rule, in order, to a single line of HTML."""
    for pattern, replacement in _LINE_RULES:
        line = pattern.sub(replacement, line)
    return line


def parse_comment(input):
    """Extract the comment texts from an HTML page as lightweight markup.

    Each line is first rewritten with regular expressions: paragraph and
    list wrappers are dropped, headings become ``hN. ``, list items become
    ``# item``, emphasis becomes ``_``/``*`` and links are reduced to their
    URL followed by the link text. The rewritten page is then fed to
    ``LinksParser``, which keeps only the text located inside
    ``<div class="comment-content">`` blocks.

    Args:
        input: iterable of strings, one per line of the HTML page. Several
            rules (leading-whitespace removal, ``</li>`` handling) operate
            line by line.

    Returns:
        A list of non-empty text chunks with surrounding newlines and
        spaces removed, and with ``<pre>``/``<code>`` tags restored.

    Side effects:
        Writes the rewritten page to a file named ``k`` in the current
        working directory.
    """
    output = "".join(_convert_line(ligne) for ligne in input)

    # Dump of the intermediate text; the context manager guarantees the
    # file is flushed and closed.
    with open("k", 'w') as dump:
        dump.write(output)

    p = LinksParser()
    p.feed(output)
    # Flush any text the parser is still buffering at end of input.
    p.close()

    list_comments = []
    for comment in p.data:
        # Trim newlines first, then spaces, at both ends.
        comment = comment.strip('\n').strip(' ')
        for placeholder, tag in _PLACEHOLDER_TAGS:
            comment = comment.replace(placeholder, tag)
        if comment:
            list_comments.append(comment)
    return list_comments
|