# Title: A small web crawler implemented in Python -- oh yeah! (Suddenly realized how poor my manners are.)

# -*- coding:utf-8 -*-
from lxml import *
import urllib
import lxml.html
import httplib
import time

def getHtml(url):
    """Fetch a page and return it parsed as an lxml HTML document.

    ``url`` is appended to a fixed site prefix, the resulting address is
    printed, and the whole response body is read before parsing.

    Returns the result of ``lxml.html.document_fromstring`` on the body.
    Errors raised by ``urllib.urlopen``, ``read`` or the parser propagate
    to the caller.
    """
    url = "[和谐!嘿嘿]" + url
    print('get HTML from :')
    print(url)
    h1 = urllib.urlopen(url)
    try:
        c = h1.read()
    finally:
        # close() has to be *called*; always release the response, even
        # when read() fails.
        h1.close()
    # Fixed 10 second pause after every fetch -- presumably a politeness
    # delay so the crawled site is not hammered.
    time.sleep(10)
    return lxml.html.document_fromstring(c)

def esc(st):
    """Return ``st`` encoded as UTF-8 and then percent-quoted for a URL."""
    utf8_bytes = st.encode('utf8')
    return urllib.quote(utf8_bytes)
def postData(server, params):
    """POST a form body to a fixed path on ``server`` and print the reply.

    server: host string handed to ``httplib.HTTPConnection``.
    params: request body, sent with a
        ``application/x-www-form-urlencoded`` content type.

    The response body is printed to stdout; nothing is returned.
    Connection and HTTP errors propagate to the caller.
    """
    headers = {
        'Accept': 'text/html',
        'User-Agent': 'Mozilla',
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    path = '/***[和谐哦]'
    conn = httplib.HTTPConnection(server)
    try:
        conn.request('POST', path, params, headers)
        r1 = conn.getresponse()
        print(r1.read())
    finally:
        # Release the connection even if the request or read fails.
        conn.close()

def keepArtical(classname, title, content, tags):
    """Submit one article to the local collector at 127.0.0.1.

    Each field is passed through ``esc`` and the resulting mapping is then
    url-encoded and handed to ``postData``.

    NOTE(review): values end up percent-encoded twice (``esc`` followed by
    ``urllib.urlencode``) -- presumably the receiving endpoint decodes
    twice; confirm before changing.
    """
    fields = {
        'title': esc(title),
        'class': esc(classname),
        'content': esc(content),
        'tags': esc(tags),
    }
    body = urllib.urlencode(fields)
    postData('127.0.0.1', body)

def readArtical(url):
    """Fetch one article page, extract its fields and submit them.

    The title, main content, category (last matching breadcrumb anchor)
    and first tag are pulled out with XPath and passed to ``keepArtical``.
    When the page has no tag anchors the placeholder ``'By bug !'`` is
    used instead.

    Best effort: any ordinary error (network failure, missing element,
    submit failure) is reported on stdout and swallowed so the crawl can
    move on. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not
    caught, so the crawl can still be stopped.
    """
    try:
        htm = getHtml(url)
        title = htm.xpath("//*[@id='changename']")
        content = htm.xpath("//div[@id='main']")
        cate = htm.xpath("/html/body/div[2]/div/label/a")
        # An empty result raises IndexError, handled below like any other
        # malformed page.
        category = cate[-1].text_content()
        tags = htm.xpath("/html/body/div[2]/p[3]/a")
        if tags:
            tag = tags[0].text_content()
        else:
            tag = 'By bug !'
        c = lxml.html.tostring(content[0])
        print('try to submit ...')
        keepArtical(category, title[0].text_content(), c, tag)
    except Exception:
        print('Error happened on url :')
        print(url)

def getArticallist(htm):
    """Process every article linked from a parsed listing page.

    ``htm`` is a parsed document; each anchor under ``div#fenlei/dl/dt``
    has its ``href`` handed to ``readArtical`` in document order.
    """
    anchors = htm.xpath("//div[@id='fenlei']/dl/dt/a")
    for anchor in anchors:
        readArtical(anchor.get("href"))

def getAllArticalList(url):
    """Crawl a listing page and then follow its "next page" link.

    The page at ``url`` is fetched and its articles are processed with
    ``getArticallist``. The last two anchors under ``div#fenye`` are then
    inspected, last one first; the first whose text contains U+203A is
    taken as the next-page link and crawled recursively.

    A failure to fetch ``url`` itself propagates to the caller. A failure
    while crawling a following page is reported on stdout and ends the
    pagination for this listing (best effort). ``KeyboardInterrupt`` and
    ``SystemExit`` are not caught.
    """
    htm = getHtml(url)
    getArticallist(htm)

    next_marker = u'\u203a'
    plist = htm.xpath("//div[@id='fenye']/a")
    if not plist:
        print('no other list ! do next ')
        return

    # Check the last anchor first, then the one before it (if any).
    for link in reversed(plist[-2:]):
        if next_marker in link.text_content():
            nurl = link.get("href")
            break
    else:
        # Pagination exists but offers no next-page link: last page.
        return

    try:
        getAllArticalList(nurl)
    except Exception:
        # Report the real problem instead of claiming there is no
        # further page.
        print('Error happened on url :')
        print(nurl)

def getCategory():
    """Crawl every sub-category linked from the category index page.

    Fetches ``index.php?category.htm`` and passes the ``href`` of each
    anchor under ``div#bkfl_list/dl/dd`` to ``getAllArticalList``.
    """
    index_page = getHtml('index.php?category.htm')
    print('get all categorys...')
    for link in index_page.xpath("//div[@id='bkfl_list']/dl/dd/a"):
        getAllArticalList(link.get('href'))
# Module-level call: the crawl starts as soon as this file is executed
# (or imported).
getCategory()