1
|
|
2
|
|
3
|
from html.parser import HTMLParser
|
4
|
import httplib2
|
5
|
|
6
|
|
7
|
class LinksParser(HTMLParser):
    """Collect the text found inside ``<span>`` elements of class ``parse_me``.

    Feed HTML with ``feed()``; every text fragment that occurs inside a
    matching span (including text inside spans nested within it) is appended
    to ``self.data`` in document order.
    """

    def __init__(self):
        super().__init__()
        # Depth of <span> nesting inside the span currently being captured;
        # 0 means no capture is in progress.
        self.recording = 0
        # Text fragments captured so far.
        self.data = []

    def handle_starttag(self, tag, attributes):
        """Start capturing on a ``parse_me`` span, or track a nested span.

        ``attributes`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.
        """
        if tag != 'span':
            return
        if self.recording:
            # Nested span inside a captured one: remember the depth so the
            # matching end tag does not stop the capture prematurely.
            self.recording += 1
            return
        # The class attribute is a whitespace-separated token list, so test
        # membership rather than equality; value is None for a bare
        # attribute such as <span class>.
        if any(
            name == 'class' and 'parse_me' in (value or '').split()
            for name, value in attributes
        ):
            self.recording = 1

    def handle_endtag(self, tag):
        """Leave one level of span nesting while a capture is in progress."""
        if tag == 'span' and self.recording:
            self.recording -= 1

    def handle_data(self, data):
        """Store ``data`` when it occurs inside a captured span."""
        if self.recording:
            self.data.append(data)
|
33
|
|
34
|
def give_nids(url):
    """Fetch ``url`` and return the text fragments captured by LinksParser.

    The page is requested with an HTTP GET through ``httplib2``, the body is
    decoded as UTF-8, and the resulting text is run through a fresh
    ``LinksParser``.

    Returns the parser's ``data`` list (one string per captured fragment).
    Decoding is strict, so a body that is not valid UTF-8 raises
    ``UnicodeDecodeError``.
    """
    parser = LinksParser()
    http = httplib2.Http()

    # Only the body is used; the response object is ignored.
    _resp, content = http.request(url, 'GET')
    text = content.decode('utf-8')

    parser.feed(text)
    # feed() may hold back trailing text until more input arrives; close()
    # forces the remaining buffered data through the handlers so nothing at
    # the end of the document is dropped.
    parser.close()
    return parser.data
|
43
|
|
44
|
def give_json_urls(url, base_url):
    """Return the ids scraped from ``url`` and their JSON node URLs.

    The ids come from ``give_nids(url)``. Each id is turned into
    ``base_url + '/node/' + <id> + '.json'``.

    Returns a ``(nids, tache_urls)`` tuple of two lists in matching order.
    """
    nids = give_nids(url)
    tache_urls = [base_url + '/node/' + nid + '.json' for nid in nids]
    return nids, tache_urls
|