900字范文 > python爬虫实现大麦抢票_爬虫大麦网

python爬虫实现大麦抢票_爬虫大麦网

时间：2023-04-20 13:23:11

爬取大麦网的整体思路与上一篇博客爬取豆瓣电影类似，不过大麦网的筛选项更多，例如地点、活动类型等。

本文章与爬豆瓣相比额外使用了如下功能:

docopt从终端获取参数

prettytable整理打印格式

具体代码如下：

shows.py

#!/usr/bin/env python

# -*- coding: utf-8 -*-

"""

Usage:

shows

"""

import os

import re

import csv

import sys

from prettytable import PrettyTable

from urllib.request import urlopen

from bs4 import BeautifulSoup

from docopt import docopt

sys.path.append('/home/han/PycharmProjects/TinyPythonProject/Beginner/shows')

from cities import cities

# Damai (大麦网) event-search endpoint.
# NOTE(review): the value is a bare path with no scheme or host — presumably
# the host part was lost when this code was copied; confirm the full URL.

DAMAI_BASE_URL = "/projectlist.do"

# Error messages

QUERY_DAYS_INVALID = 'Invalid days.'

CITY_NOT_FOUND = 'Sorry, your city is not supported.'

SHOW_NOT_FOUND = 'No result.'

# Show types accepted on the command line, mapped to the values that the
# script formats into the `mcid=` and `ccid=` query parameters of the listing
# URL. Presumably `mcid` is the main category and `ccid` the sub-category, with
# '' meaning "no sub-category filter" — confirm against the site.
#
# Every `ccid` is a string: the original mixed str and int values, which
# format to the same URL text, so normalising them changes no request while
# giving all entries one shape.
SHOW_TYPES = {
    '演唱会': {'mcid': 1, 'ccid': ''},
    '流行': {'mcid': 1, 'ccid': '9'},
    '摇滚': {'mcid': 1, 'ccid': '10'},
    '民族': {'mcid': 1, 'ccid': '11'},
    '音乐节': {'mcid': 1, 'ccid': '12'},
    '音乐会': {'mcid': 2, 'ccid': ''},
    '话剧歌剧': {'mcid': 3, 'ccid': ''},
    '话剧': {'mcid': 3, 'ccid': '19'},
    '歌剧': {'mcid': 3, 'ccid': '20'},
    '歌舞剧': {'mcid': 3, 'ccid': '21'},
    '音乐剧': {'mcid': 3, 'ccid': '22'},
    '儿童剧': {'mcid': 3, 'ccid': '23'},
    '舞蹈': {'mcid': 4, 'ccid': '24'},
    '芭蕾': {'mcid': 4, 'ccid': '25'},
    '舞剧': {'mcid': 4, 'ccid': '26'},
    '相声': {'mcid': 5, 'ccid': '27'},
    '魔术': {'mcid': 5, 'ccid': '28'},
    '马戏': {'mcid': 5, 'ccid': '29'},
    '杂技': {'mcid': 5, 'ccid': '30'},
    '戏曲': {'mcid': 5, 'ccid': '31'},
    '比赛': {'mcid': 6, 'ccid': ''},
}

# Scrape event titles from the listing page HTML.
# NOTE(review): the regex literal passed to re.findall below is truncated in
# this copy (only its closing quote remains), so the call does not parse.
# Presumably the pattern contained HTML markup that was stripped; restore it
# from the original source before running.

def getTitle(html):

# Collect every candidate match in the page (re.S lets '.' span newlines).
titleList = re.findall(

', html,

re.S)

newTitleList = []

# Keep only matches that contain none of "js", "css", "dale" or "icon".
for index, item in enumerate(titleList):

if item.find("js") == -1 and item.find("css") == -1 and item.find("dale") == -1 and item.find(

"icon") == -1:

newTitleList.append(item)

return newTitleList

# Scrape event detail links from the listing page HTML (the original comment
# said "title", copied from getTitle; the script stores these results in the
# event-link column).
# NOTE(review): the regex literal passed to re.findall below is truncated in
# this copy (only its closing quote remains), so the call does not parse.
# Presumably the pattern contained HTML markup that was stripped; restore it
# from the original source before running.

def getDetial(html):

detialList = re.findall(

', html, re.S)

newDetialList = []

# Copy every match, unfiltered, into the returned list.
for index, item in enumerate(detialList):

newDetialList.append(item)

return newDetialList

# Scrape image links from the listing page HTML.
# NOTE(review): the regex literal passed to re.findall below is truncated in
# this copy (only its closing quote remains), so the call does not parse.
# Presumably the pattern contained HTML markup that was stripped; restore it
# from the original source before running.

def getImg(html):

# Collect every candidate match in the page (re.S lets '.' span newlines).
imgList = re.findall(

', html,

re.S)

newImgList = []

# Keep only matches that contain none of "js", "css", "dale" or "icon".
for index, item in enumerate(imgList):

if item.find("js") == -1 and item.find("css") == -1 and item.find("dale") == -1 and item.find(

"icon") == -1:

newImgList.append(item)

return newImgList

# Scrape show times: captures the text that follows each "时间：" label.
# NOTE(review): the raw-string pattern below is split across lines in this
# copy, which is not valid for a single-quoted literal, and the capture group
# has no closing delimiter after it. Presumably HTML markup around the label
# was stripped; restore the pattern from the original source before running.

def getTime(html):

timeList = re.findall(r'

.*?时间：(.*?)', html, re.S)

newTimeList = []

# Copy every match, unfiltered, into the returned list.
for index, item in enumerate(timeList):

newTimeList.append(item)

return newTimeList

# Scrape venues
def getPlace(html):
    """Return the text captured after each '场馆：' label in *html*.

    NOTE(review): as written, the lazy group ``(.*?)`` is followed only by
    another lazy ``.*?``, so the group always captures the empty string.
    Presumably an HTML delimiter after the group was stripped from this copy;
    restore the pattern from the original source.

    Args:
        html: Page markup as a string.

    Returns:
        A new list holding the group capture of every match, in page order.
    """
    # re.findall already returns a fresh list, so no copy loop is needed.
    return re.findall(r'场馆：(.*?).*?', html, re.S)

# Scrape ticket prices
def getPrice(html):
    """Return every match of the price pattern in *html*.

    NOTE(review): the pattern in this copy is a bare lazy group with no
    surrounding markup, so it is not tied to any price element. Presumably the
    HTML around the group was stripped; restore the pattern from the original
    source.

    Args:
        html: Page markup as a string.

    Returns:
        A new list holding the group capture of every match, in page order.
    """
    # re.findall already returns a fresh list, so no copy loop is needed.
    return re.findall(r'(.*?)', html, re.S)

# Scrape sale status: captures the text that follows each "状态: " label.
# NOTE(review): the raw-string pattern below is split across lines in this
# copy, which is not valid for a single-quoted literal. Presumably HTML markup
# around the label was stripped; restore the pattern from the original source
# before running.

def getStatus(html):

statusList = re.findall(r'

状态: (.*?)

', html, re.S)

newStatusList = []

# Copy every match, unfiltered, into the returned list.
for index, item in enumerate(statusList):

newStatusList.append(item)

return newStatusList

# Download the raw body of a URL
def getHtml(url):
    """Fetch *url* and return the response body as bytes.

    This is best-effort: any failure is reported on stdout and an empty bytes
    object is returned. The failure value is ``b""`` rather than ``""`` so the
    result has the same type on both paths and callers can always call
    ``.decode()`` on it.

    Args:
        url: Address handed unchanged to ``urllib.request.urlopen``.

    Returns:
        The response body, or ``b""`` if the request failed.
    """
    try:
        # The context manager closes the connection even if read() raises.
        with urlopen(url) as page:
            return page.read()
    except Exception as e:  # deliberate catch-all: one bad page must not abort the run
        print("failed to geturl:", e)
        return b""

# Save the collected rows to a CSV file
def saveInfo(
    infoList,
    path='/home/han/PycharmProjects/TinyPythonProject/Beginner/shows/shows_scraper.csv',
):
    """Write a header row followed by *infoList* to a UTF-8 CSV file.

    Args:
        infoList: Iterable of rows; each row is a sequence of seven values in
            the column order of the header written below.
        path: Destination file. Defaults to the location the script has always
            written to, so existing calls behave as before; pass another path
            to run on a different machine.

    Raises:
        OSError: If the destination cannot be opened for writing.
    """
    # newline='' lets the csv module control line endings itself.
    with open(path, 'w', newline='', encoding='utf-8') as fp:
        writer = csv.writer(fp, delimiter=',')  # fields in a row are comma-separated
        writer.writerow(['活动', '活动链接', '图片', '时间', '场馆', '价格', '状态'])
        writer.writerows(infoList)
    print('保存完毕')

# Print the rows as a formatted table
def pretty_print(infoList):
    """Print *infoList* to stdout as a PrettyTable.

    Args:
        infoList: Iterable of rows; each row is a sequence of seven values in
            the column order of the header set below.
    """
    pt = PrettyTable()
    # Use the public `field_names` property instead of the private
    # `_set_field_names` method, which is an implementation detail of the
    # library and not part of its documented API.
    pt.field_names = ['活动', '活动链接', '图片', '时间', '场馆', '价格', '状态']
    for info in infoList:
        pt.add_row(info)
    print(pt)

# Command-line entry point: scrape every listing page for one city / show type.

# NOTE(review): in this copy the module docstring's usage line reads only
# `shows` and the argument lookups were `arguments['']`, which docopt cannot
# satisfy — the angle-bracket placeholders appear to have been lost. The usage
# pattern is restated here so the entry point is self-contained; the
# placeholder names are a reconstruction.
USAGE = """Usage:
    shows <city> <type>
"""

# Placeholder stored in every column when a page could not be downloaded.
MISSING = 'none'


def page_url(city_id, show_type, page=None):
    """Build the listing URL for a city / show type, optionally for one page.

    The path comes from DAMAI_BASE_URL, which has the same value as the literal
    the script used before, so the produced string is unchanged.
    """
    url = '{}?cityID={}&mcid={}&ccid={}'.format(
        DAMAI_BASE_URL, city_id, show_type['mcid'], show_type['ccid']
    )
    if page is not None:
        url += '&pageIndex={}'.format(page)
    return url


def parse_page_count(page_num_text):
    """Return N from pager text containing '共N页'.

    Raises:
        ValueError: If the markers are missing or N is not an integer.
    """
    start = page_num_text.index('共') + 1
    # Look for the closing marker only after the opening one.
    return int(page_num_text[start:page_num_text.index('页', start)])


def absolute_link(link):
    """Prefix a scraped link with 'https:'; leave the MISSING placeholder as is."""
    return link if link == MISSING else 'https:' + link


def build_rows(titles, details, imgs, times, places, prices, status):
    """Combine the per-column lists into table rows.

    zip() stops at the shortest column, so a page where one extractor found
    fewer items yields fewer rows instead of raising IndexError.
    """
    return [
        [title, absolute_link(detail), absolute_link(img), when, place, price, state]
        for title, detail, img, when, place, price, state
        in zip(titles, details, imgs, times, places, prices, status)
    ]


def main():
    """Parse the command line, scrape all pages, then save and print the rows."""
    arguments = docopt(USAGE)

    city = cities.get(arguments['<city>'])
    if city is None:
        sys.exit(CITY_NOT_FOUND)
    # Named show_type rather than `type` to avoid shadowing the builtin.
    show_type = SHOW_TYPES.get(arguments['<type>'])
    if show_type is None:
        sys.exit('Sorry, your show type is not supported.')

    # The first request is only used to read the total page count.
    first_page = getHtml(page_url(city, show_type))
    if not first_page:
        sys.exit(1)  # getHtml has already printed the reason
    pager = BeautifulSoup(first_page, 'html.parser').findAll("span", {"class": "ml10"})
    try:
        page_num = parse_page_count(pager[0].get_text())
    except (IndexError, ValueError):
        # No pager element, or no "共N页" text in it.
        sys.exit(SHOW_NOT_FOUND)
    print("共%d页" % page_num)  # total number of result pages

    # One list per output column, in the order build_rows expects.
    extractors = (getTitle, getDetial, getImg, getTime, getPlace, getPrice, getStatus)
    columns = [[] for _ in extractors]

    for page in range(1, page_num + 1):
        url = page_url(city, show_type, page)
        print("page:%d,url:%s" % (page, url))
        raw = getHtml(url)
        if not raw:
            # append, not extend: extend('none') would add four one-character items.
            for column in columns:
                column.append(MISSING)
            continue
        html = raw.decode("UTF-8")
        for column, extract in zip(columns, extractors):
            column.extend(extract(html))

    for column in columns:
        print(len(column))
    if len({len(column) for column in columns}) > 1:
        print("warning: column lengths differ; rows are cut to the shortest column")

    allInfo = build_rows(*columns)
    saveInfo(allInfo)  # save as a CSV file
    pretty_print(allInfo)  # print as a formatted table


if __name__ == "__main__":
    main()

cities.py

#!/usr/bin/env python

# -*- coding: utf-8 -*-

# Supported city names mapped to the ID string that shows.py formats into the
# `cityID` query parameter of the listing URL.
# NOTE(review): '南宁' and '徐州' both map to '2024' below — one of the two is
# presumably wrong; verify against the site.
cities = {

'北京': '852',

'上海': '872',

'广州': '893',

'深圳': '906',

'武汉': '586',

'苏州': '1087',

'成都': '1377',

'重庆': '200',

'长沙': '702',

'南京': '1038',

'杭州': '1580',

'沈阳': '1703',

'无锡': '1052',

'宁波': '1597',

'郑州': '2148',

'天津': '1209',

'大连': '1725',

'南昌': '465',

'西安': '3250',

'常州': '1077',

'昆明': '1229',

'桂林': '2103',

'厦门': '372',

'太原': '2984',

'福州': '356',

'温州': '1612',

'合肥': '2520',

'珠海': '913',

'中山': '947',

'石家庄': '2495',

'佛山': '923',

'南宁': '2024',

'长春': '2812',

'哈尔滨': '2648',

'香港': '848',

'青岛': '1847',

'澳门': '850',

'贵阳': '242',

'济南': '1835',

'东莞': '917',

'呼和浩特': '3167',

'银川': '54',

'海外': '76',

'柳州': '2037',

'徐州': '2024',

'绍兴': '1643'

}

运行格式为 python3 shows.py [地点] [演出类型]

例如：python3 shows.py 北京 演唱会

注意：地点必须是cities.py中的地点，类型必须为 SHOW_TYPES中的活动类型

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。

python爬虫实现大麦抢票_爬虫 大麦网

python爬虫实现大麦抢票_爬虫大麦网