Web Data Mining with Python Crawlers, Summarized and Analyzed: Link Analysis and Link Statistics

Our group is currently working on source-code analysis and statistical link analysis in parallel; progress reports on both will follow later.

Using Python crawlers for Web data mining has become very common, and there is no shortage of crawler tutorials online, but few of them summarize and analyze Web data mining systematically.

[python]
# Copyright (C) 2012 xxx(xxx) Co., LTD.
# All rights reserved.
#
# Developed by RD BIOS Team.
#
# Authors: perry <[email protected]>
#
# Date: January 11, 2012
#
# Project Name: WEBDOWN
# Project Version: 1.0.0
#
# Project description:
#
# History:
#    Date        Author      Description
#    --------------------------------------------------------------
#    2012/01/11  perry       created.
#
# Note:
#  xxx

__version__ = '1.0.0'

import os, sys, io

import sqlite3

try:
  # Python 2.7
  from urlparse import urlparse
  from urllib import (
    unquote,
    url2pathname)
except ImportError:
  # Python 3.2
  from urllib.parse import urlparse, unquote
  from urllib.request import url2pathname

try:
  # Python 2.7
  from HTMLParser import HTMLParser
except ImportError:
  # Python 3.2
  from html.parser import HTMLParser

try:
  # Python 2.7
  from httplib import HTTPConnection
except ImportError:
  # Python 3.2
  from http.client import HTTPConnection

import time
import threading

class DownloadThread(threading.Thread):
  """Worker thread: pulls file URLs out of the shared Webdown queue and
  downloads each one to the matching path under the current directory."""
  def __init__(self, wd):
    self.wd = wd
    threading.Thread.__init__(self)

  def run(self):
    http = HTTPConnection(self.wd.url)

    while True:
      s = self.wd.get1()
      if s is None:
        # No pending file: stop once the directory scan has finished,
        # otherwise wait for more work to arrive.
        if not self.wd.finished:
          break
        time.sleep(1)
        continue

      x = unquote(s.encode(sys.stdin.encoding))
      p = os.getcwd() + url2pathname(x)

      if not os.path.exists(p):
        try:
          http.close()
          http.request('GET', s)
          r = http.getresponse()
          if r.status == 200:
            print r.getheader('content-length', 0), s

            f = open(p, 'wb')
            try:
              f.write(r.read())
            finally:
              f.close()
        except:
          print 'FAIL ', s
      else:
        print 'EXISTS ', s

      self.wd.set1(s, 1)

    print('exit...')

class Webdown(HTMLParser):
  """Parses directory-index pages and records every link in an in-memory
  SQLite table so the download thread can work through them."""
  finished = False
  def __init__(self, url):
    try:
      url_info = urlparse(url, 'http')
      self.url = url_info.netloc
      self.http = HTTPConnection(url_info.netloc)
      self.dbc = sqlite3.connect(':memory:', check_same_thread=False)
      self.lock = threading.Lock()
      self.path = url_info.path
      self.dbc.execute('''
        create table if not exists download (
          id integer primary key autoincrement,
          name text,
          url text,
          path text,
          local_path text,
          is_dir integer default 0,
          is_searched integer default 0,
          is_queried integer default 0,
          is_download integer default 0)''')

      # Normalize the start path so that it ends with exactly one '/'.
      name = self.path
      while name.endswith('/'):
        name = name[:-1]
      self.path = name + '/'

      i = name.rfind('/')
      if i > 0:
        name = name[i + 1:]

      self.puturl(name, self.url, self.path, os.getcwd(), 1)
    except:
      print('WebDown initialize failure...')

    HTMLParser.__init__(self)

  def handle_starttag(self, tag, attrs):
    # Only plain <a href="..."> links are of interest.
    if tag != 'a' or len(attrs) != 1 or attrs[0][0] != 'href':
      return

    href = attrs[0][1]
    if href == '../':     # ignore the parent folder.
      return

    if href == './':      # ignore the current folder.
      return

    if href.startswith('?'):
      return

    if href.startswith('~'):
      return

    dir = 0
    name = href
    searched = 1

    if name.endswith('/'):
      name = name[:-1]
      searched = 0
      dir = 1

    self.puturl(name, self.url, self.path + href, '', dir, searched)

  def puturl(self, name, url, path, lpath='', isdir=0, searched=0):
    self.lock.acquire()
    self.dbc.execute('insert into download '
      '(name,url,path,local_path,is_dir,is_searched) values(?,?,?,?,?,?)',
      (name, url, path, lpath, isdir, searched))
    self.lock.release()

  def set1(self, path, status=0):
    # Mark a file as handled by the download thread.
    self.lock.acquire()
    self.dbc.execute('update download set is_queried=? where path=?',
      (status, path))
    self.lock.release()

  def get1(self):
    # Fetch one file that has not been handled yet.
    self.lock.acquire()
    r = self.dbc.execute('select path from download where is_dir=0 and '
      'is_queried=0 limit 1')
    s = r.fetchone()
    self.lock.release()

    if s is not None:
      return s[0]
    return s

  def set2(self, path, status=0):
    # Mark a directory as scanned.
    self.lock.acquire()
    self.dbc.execute('update download set is_searched=? where path=?',
      (status, path))
    self.lock.release()

  def get2(self, url):
    # Fetch one directory that still has to be scanned.
    self.lock.acquire()
    r = self.dbc.execute('select path from download where url=? and '
      'is_searched=0 and is_dir=1 limit 1', (url,))
    s = r.fetchone()
    self.lock.release()

    if s is not None and s[0] is not None:
      s = s[0]
      if not s.endswith('/'):
        s = s + '/'
    return s

  def set3(self, path, status=0):
    self.lock.acquire()
    self.dbc.execute('update download set is_download=? where path=?',
      (status, path))
    self.lock.release()

  def get3(self):
    self.lock.acquire()
    r = self.dbc.execute('select path from download where is_dir=0 and '
      'is_download=0 limit 1')
    s = r.fetchone()
    self.lock.release()

    if s is not None:
      return s[0]
    return s

  def go(self):
    # 'finished' stays True while the directory scan is still running, so
    # the download thread keeps waiting for new files instead of exiting.
    self.finished = True
    q = DownloadThread(self)
    q.start()
    while self.path is not None:
      try:
        s = unquote(self.path.encode(sys.stdin.encoding))
        p = os.getcwd() + url2pathname(s)
        if not os.path.exists(p):
          os.makedirs(p)
        #print(s)
      except:
        pass

      try:
        self.http.close()
        self.http.request('GET', self.path)
        r = self.http.getresponse()
        if r.status == 200:
          self.reset()
          self.feed(r.read())
      except:
        pass

      self.set2(self.path, 1)
      self.path = self.get2(self.url)

    self.finished = False
    q.join()

if __name__ == "__main__":
  if len(sys.argv) > 1:
    url = sys.argv[1]
    url = url.strip()
  else:
    print('You must provide a valid URL.\n')
    print('Usage:\n  python %s target' % os.path.basename(sys.argv[0]))
    print('    target   -- specify a URL to download.\n')
    url = ''
    while len(url) == 0:
      if sys.version.startswith('3'):
        url = input('Please enter a URL:')
      else:
        url = raw_input('Please enter a URL:')
      url = url.strip()
  wd = Webdown(url)
  wd.go()
 
Code excerpted from perry_peng's column.

This article looks at how to analyze link relationships for use in statistical analysis.



In a word: life is short, I use Python.

0x01 Types of Web Data Mining

In terms of purpose, Web data mining falls into three categories. The most common is crawling website content: text, images, files, and so on. The second is crawling website structure: site directories, the jump relationships between links, second-level domains (subdomains), and the like. A third kind of crawler mines Web application data, such as the site's CMS type and installed Web plugins.

For link statistics, the basic working principle is to traverse the pages under the mirror directory, parse out the link addresses with a regular expression, and output the link relationships. The resulting file can then serve as the input of the next program, which computes the out-degree and in-degree of each page and the PageRank (PR) values; a sketch of such a follow-up program is given after the extraction script below.

Here is the source code:

# coding: utf-8
#
# Walk a Heritrix mirror directory, extract the href links from every
# .asp/.htm/.html page, and dump the link relationships as index pairs.
# (Python 2; os.path.walk was removed in Python 3.)

import os, re

rootdir = '/home/xxx/workspace/heritrix/jobs/ccer-20100930010817713/mirror/www.ccer.pku.edu.cn'

dotfile = open('links.data', 'w', 4096000)

count = 0
urllist = []

def append2list(url):
    # Assign every URL a stable integer index.
    if url not in urllist:
        urllist.append(url)
    return urllist.index(url)

def extract(dirr, name):
    #print "extracting:", dirr, name
    f = open(dirr + '/' + name, 'r')
    cururl = 'http://' + dirr[dirr.find('www.ccer.pku.edu.cn'):] + '/' + name
    curindex = append2list(cururl)

    hrefs = re.findall(r'''href=('|")?([^\s'"><()]+)(\1?)''', f.read())
    for href in hrefs:
        # Skip anchors, mail/javascript links and binary resources.
        if not href[0] == href[2]\
            or href[1] == '#'\
            or href[1] == './'\
            or href[1].startswith('mailto:')\
            or href[1].startswith('javascript')\
            or href[1].endswith('.css')\
            or href[1].endswith('.jpg')\
            or href[1].endswith('.bmp')\
            or href[1].endswith('.jpeg')\
            or href[1].endswith('.ico')\
            or href[1].endswith('.gif')\
            or href[1].endswith('.pdf')\
            or href[1].endswith('.ppt')\
            or href[1].endswith('.doc')\
            or href[1].endswith('.xls')\
            or href[1].endswith('.pptx')\
            or href[1].endswith('.docx')\
            or href[1].endswith('.xlsx')\
            or href[1].endswith('.zip')\
            or href[1].endswith('.png'):
            pass
        else:
            realref = href[1]
            if not realref.startswith('http'): # relative links
                if '.asp?' in realref:
                    realref = realref.replace('.asp?', '', 1) + '.asp' # file name on disk
                realref = 'http://' + dirr[dirr.find('www.ccer.pku.edu.cn'):] + '/' + realref
            #print realref
            refindex = append2list(realref)
            global count
            dotfile.write('%d %d\n' % (curindex, refindex))
            count += 1
            if count % 10000 == 0:
                print count
    f.close()

def filter(dummy, dirr, filess):
    for name in filess:
        if os.path.splitext(name)[1] in ['.asp', '.htm', '.html'] and os.path.isfile(dirr + '/' + name):
            extract(dirr, name)

os.path.walk(rootdir, filter, None)

dotfile.close()

urlfile = open('linkindex.txt', 'w', 4096000)
for url in urllist:
    urlfile.write(url + '\n')
urlfile.close()
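The "next program" that consumes links.data is not included in the post. The following is only a minimal sketch of what it could look like, assuming the two files produced above (links.data with one "source target" index pair per line, linkindex.txt with one URL per index); the damping factor of 0.85 and the 30 iterations for the PageRank step are arbitrary choices of this sketch, not something the author specified.

# Hypothetical consumer of links.data / linkindex.txt: counts out-degree
# and in-degree per page and runs a few power iterations of the classic
# PageRank update with damping d = 0.85.
from collections import defaultdict

edges = []
out_deg = defaultdict(int)
in_deg = defaultdict(int)

with open('links.data') as f:
    for line in f:
        src, dst = map(int, line.split())
        edges.append((src, dst))
        out_deg[src] += 1
        in_deg[dst] += 1

urls = [line.strip() for line in open('linkindex.txt')]
n = len(urls)

d = 0.85
pr = [1.0 / n] * n
for _ in range(30):
    new_pr = [(1.0 - d) / n] * n
    for src, dst in edges:
        new_pr[dst] += d * pr[src] / out_deg[src]
    # Redistribute the rank of dangling pages (no outgoing links) evenly.
    dangling = sum(pr[i] for i in range(n) if out_deg[i] == 0)
    new_pr = [x + d * dangling / n for x in new_pr]
    pr = new_pr

# Print the ten highest-ranked pages with their degree counts.
for i in sorted(range(n), key=lambda i: pr[i], reverse=True)[:10]:
    print('%s out=%d in=%d pr=%.6f' % (urls[i], out_deg[i], in_deg[i], pr[i]))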

0x02 Mining Website Content


网址内容开掘利用最广,最为广泛,英特网的Python爬虫资料许多也都属于那类。爬取下的故事情节也可用来相当多地点。

Python编写那类爬虫的宽广思路正是选取request或urllib2库定制诉求,利用BeautifulSoup对原始网页进行解析,定位一定html标签,搜索目的内容。借使要抓牢质量,能够接纳threading启用八线程,gevent启用协程(在windows上使用只怕会稍为难题),也能够用multiprocessing运维多进程。multiprocessing能突破python的GIL全局解释器锁的界定。
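To make that pattern concrete, here is a minimal sketch (not taken from the original post) of the fetch-and-parse step, assuming Python 2 with urllib2 and BeautifulSoup 3 as used in the other listings in this article; the URL and the tags picked out are arbitrary examples.

# Illustrative fetch-and-parse sketch (Python 2, BeautifulSoup 3);
# the URL is a placeholder, not a real target.
import urllib2
from BeautifulSoup import BeautifulSoup

url = 'http://www.example.com/'
request = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(request, timeout=10).read()

soup = BeautifulSoup(html)
print soup.title.string          # text of the <title> tag
for link in soup.findAll('a'):   # every <a> tag on the page
    print link.get('href')

To parallelize this, the same fetch function can be handed to several threading.Thread workers or to a multiprocessing.Pool; the latter avoids the GIL limitation mentioned above.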

There is already plenty of material on this kind of crawler, so it is not covered further here.

0x03 Mining Website Structure

Structure mining is less common, but it is needed in some specialized scenarios. For a Web vulnerability scanner, for example, crawling the site's full directory tree and enumerating its second-level domains (subdomains) is essential. Even in the first category, content mining, you sometimes need to start from a single page and fetch and analyze everything on the site, which again requires analyzing the site's structure.
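The article gives no code for the subdomain part. One common approach, sketched below under the assumption of a small hand-made wordlist and the placeholder domain example.com, is simply to try resolving candidate names over DNS and keep the ones that answer.

# Hypothetical subdomain check (not from the original article): resolve
# candidate names and keep those that get an answer. The wordlist and
# domain are placeholders.
import socket

domain = 'example.com'
wordlist = ['www', 'mail', 'bbs', 'blog', 'dev', 'test', 'admin']

found = []
for word in wordlist:
    name = '%s.%s' % (word, domain)
    try:
        ip = socket.gethostbyname(name)
        found.append((name, ip))
        print('%s -> %s' % (name, ip))
    except socket.error:
        pass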

For directory crawling, one key concern is crawler performance. A site usually has a great many pages, and fetching every directory outright can take a long time. The search strategy for links also has a large impact on performance. The usual choice is breadth-first search: start from the entry page, collect every link on it, and check whether each link is on-site and whether it has already been crawled. To speed things up, links can be grouped into classes, treating /page.php?id=1 and /page.php?id=2 as the same kind of link so it is not crawled twice. A simple implementation follows:

# coding=utf-8
'''
Crawl all directories of a website.
Author: bsdr
Email: 1340447902@qq.com
'''
# NOTE: a number of argument lists were garbled when this listing was
# reposted; they have been reconstructed from context.
import urllib2
import re
from BeautifulSoup import BeautifulSoup
import time

t = time.time()

HOST = ''
CHECKED_URL = []    # url patterns already checked
CHECKING_URL = []   # urls waiting to be checked
RESULT = []         # detection results
RETRY = 3           # number of retries
TIMEOUT = 2         # timeout in seconds


class url_node:
    def __init__(self, url):
        '''
        Initialize a url node.
        :param url: String, the current url
        :return:
        '''
        # self.deep = deep
        self.url = self.handle_url(url, is_next_url=False)
        self.next_url = []
        self.content = ''

    def handle_url(self, url, is_next_url=True):
        '''
        Normalize every url into a standard form.

        :param url: String
        :param is_next_url: Bool, whether the incoming url is the one
                            currently being checked or a next-level url
        :return: None, an error message, or the normalized url
        '''
        global CHECKED_URL
        global CHECKING_URL

        # Strip a trailing '/'.
        url = url[0:len(url) - 1] if url.endswith('/') else url

        if url.find(HOST) == -1:
            if not url.startswith('http'):
                url = 'http://' + HOST + url if url.startswith('/') \
                    else 'http://' + HOST + '/' + url
            else:
                # The url's host is not the current host: return nothing.
                return
        else:
            if not url.startswith('http'):
                url = 'http://' + url

        if is_next_url:
            # Next-level urls go into the to-be-checked list.
            CHECKING_URL.append(url)
        else:
            # For the url currently being checked, replace all parameter
            # values with 1 and record the pattern, so urls of the same
            # type that differ only in parameters are checked once.
            rule = re.compile(r'=.*?&|=.*?$')
            result = re.sub(rule, '=1&', url)
            if result in CHECKED_URL:
                return '[!] Url has checked!'
            else:
                CHECKED_URL.append(result)
                RESULT.append(url)

        return url

    def __is_connectable(self):
        # Verify that the url is reachable.
        retry = 3
        timeout = 2
        for i in range(retry):
            try:
                response = urllib2.urlopen(self.url, timeout=TIMEOUT)
                return True
            except:
                if i == retry - 1:
                    return False

    def get_next(self):
        # Collect every url on the current page.
        soup = BeautifulSoup(self.content)
        next_urls = soup.findAll('a')
        if len(next_urls) != 0:
            for link in next_urls:
                self.handle_url(link.get('href'))

    def run(self):
        if self.url:
            print self.url
            if self.__is_connectable():
                try:
                    self.content = urllib2.urlopen(self.url, timeout=TIMEOUT).read()
                    self.get_next()
                except:
                    print('[!] Connect Failed')


class Poc:
    def run(self, url):
        global HOST
        global CHECKING_URL
        url = check_url(url)

        if not url.find('https'):   # i.e. the url starts with 'https'
            HOST = url[8:]
        else:
            HOST = url[7:]

        # Reconstructed from the garbled listing: crawl the entry page
        # first so CHECKING_URL gets seeded, then work through the list
        # (which keeps growing while it is iterated).
        url_node(url).run()
        for url in CHECKING_URL:
            print url
            url_node(url).run()


def check_url(url):
    url = 'http://' + url if not url.startswith('http') else url
    url = url[0:len(url) - 1] if url.endswith('/') else url

    for i in range(RETRY):
        try:
            response = urllib2.urlopen(url, timeout=TIMEOUT)
            return url
        except:
            raise Exception("Connect error")


if __name__ == '__main__':
    pass    # the original post breaks off here; see the note below
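The listing is cut off at the __main__ guard, so the actual invocation is missing from the post. A hypothetical completion of that guard, assuming the intent was to take the target from the command line, crawl it, and print the recorded URLs along with the elapsed time (the otherwise unused t = time.time() at the top hints at the latter), might look like this:

# Hypothetical completion of the truncated __main__ block; not part of
# the original post. Assumes it is appended to the script above, so Poc,
# RESULT, time and t are the script's own names.
import sys

if __name__ == '__main__':
    target = sys.argv[1] if len(sys.argv) > 1 else 'www.example.com'
    Poc().run(target)
    print('\n'.join(RESULT))
    print('Time used: %f s' % (time.time() - t))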
