1
|
|
2
|
|
3
|
from html.parser import HTMLParser
|
4
|
import httplib2
|
5
|
|
6
|
|
7
|
class LinksParser(HTMLParser):
    """HTML parser that collects the text inside ``<span class="parse_me">``.

    Text is captured from the moment a ``span`` whose ``class`` attribute is
    exactly ``'parse_me'`` opens until that same ``span`` closes. Text of
    ``span`` elements nested inside it is captured as well.

    Attributes:
        recording: Nesting depth of ``span`` elements inside the span being
            captured; ``0`` means nothing is being captured.
        data: Captured text chunks, in document order.
    """

    def __init__(self):
        super().__init__()
        self.recording = 0
        self.data = []

    def handle_starttag(self, tag, attributes):
        """Start capturing on a matching span, or track a nested span."""
        if tag != 'span':
            return
        if self.recording:
            # Already inside a captured span: count the nested span so that
            # its closing tag does not end the capture too early.
            self.recording += 1
            return
        # The comparison is exact: a multi-valued class such as
        # "parse_me other" does not start a capture.
        if any(name == 'class' and value == 'parse_me'
               for name, value in attributes):
            self.recording = 1

    def handle_endtag(self, tag):
        """Leave one level of span nesting while a capture is active."""
        if tag == 'span' and self.recording:
            self.recording -= 1

    def handle_data(self, data):
        """Store a text chunk when it occurs inside a captured span."""
        if self.recording:
            self.data.append(data)
|
34
|
|
35
|
def give_nids(url):
    """Download the page at *url* and return the text of its marked spans.

    The page is fetched with an HTTP GET through ``httplib2``, decoded as
    UTF-8 and run through ``LinksParser``.

    Args:
        url: Address of the HTML page to fetch.

    Returns:
        The list of text chunks collected by ``LinksParser`` (its ``data``
        attribute), in document order.

    Raises:
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    parser = LinksParser()
    http = httplib2.Http()

    # The response object is not inspected; only the body is used.
    # NOTE(review): the HTTP status is never checked, so an error page is
    # parsed like any other page -- confirm this is intended.
    _response, content = http.request(url, 'GET')
    text = content.decode('utf-8')

    parser.feed(text)
    # Force processing of whatever the parser is still buffering after feed().
    parser.close()
    return parser.data
|
44
|
|
45
|
def give_json_urls(url, base_url):
    """Build one ``.json`` node URL per identifier found on the page at *url*.

    Args:
        url: Address of the HTML page passed to ``give_nids``.
        base_url: Prefix placed before ``/node/<nid>.json``. It is used
            verbatim, so a trailing slash yields a double slash.

    Returns:
        A ``(nids, tache_urls)`` tuple: the identifiers returned by
        ``give_nids`` and the matching list of URLs, in the same order.
    """
    nids = give_nids(url)
    tache_urls = [base_url + '/node/' + nid + '.json' for nid in nids]
    return nids, tache_urls
|