Scraping Dangdang search results with regular expressions and saving them to CSV
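This post scrapes Dangdang book search results with nothing more than re and requests. Each result on a search page is a numbered <li ddt-pit="..."> block, and non-greedy patterns ((.*?) together with re.S, which lets . match newlines) first cut out that block and then pull the title, price, author, and publisher from it. Here is a minimal sketch of the extraction idea, run against a simplified stand-in fragment (the HTML below is an assumption for illustration, not real Dangdang markup):

import re

# Simplified stand-in for one search-result item (illustrative, not real Dangdang HTML)
html = ('<li ddt-pit="1" class="line1">'
        '<a title="Python网络爬虫实战" href="#"></a>'
        '<span class="search_now_price">&yen;59.00</span>'
        '</li>')

# re.S makes '.' also match newlines, so one pattern can span a multi-line item
item = re.findall(r'<li ddt-pit="1"(.*?)>(.*?)</li>', html, re.S)
title = re.findall(r'<a title="(.*?)"', str(item), re.S)
price = re.findall(r'_now_price">.*?(\d+\D\d+)', str(item), re.S)
print(title, price)  # ['Python网络爬虫实战'] ['59.00']

The full spider follows; the Cookie has to come from your own browser session.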
import re
import requests
# Set the request headers and Cookie as required
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
    'Cookie': '__permanent_id=20221203162355205379923102881427769; __rpm=|s_112100...1670079692842; search_passback=b6219fd996d8e8e9e9118c63fc010000de89c800e1118c63; ddscreen=2; dest_area=country_id=9000&province_id=111&city_id=0&district_id=0&town_id=0; __visit_id=20221204112010093339809160925577017; __out_refer=; __trace_id=20221204112010094534951493076142788; pos_9_end=1670124010185; pos_0_start=1670124010224; pos_0_end=1670124010229; ad_ids=2095975|#1'
}
def spider(keyword, page):
    for i in range(1, page):  # crawl search result pages 1 .. page-1
        url = 'http://search.dangdang.com/?key=' + str(keyword) + '&page_index=' + str(i)
        print(url)
        res = requests.get(url, headers=headers, timeout=2)
        html = res.text
        for a in range(1, 60):  # walk the numbered <li ddt-pit="..."> result items
            lis = re.findall(r'<li ddt-pit="' + str(a) + '"(.*?)>(.*?)</li>', html, re.S)
            title = re.findall(r'<a title="(.*?)"', str(lis), re.S)  # book title
            price = re.findall(r'_now_price">.*?(\d+\D\d+)', str(lis), re.S)  # price
            author1 = re.findall(r'<p class="search_book_author">(.*?)</p>', str(lis), re.S)
            title = title[0] if title else ''
            price = price[0] if price else ''
            try:
                # indexing [0] raises IndexError when the item has no author link
                author = re.findall(r'2=(.*?)&medium', str(author1), re.S)[0]
                pub3 = re.findall(r'P_cbs.*?>(.*?)</a', str(author1), re.S)[0]
                with open("dsj_2002_20200126057_.csv", "a", encoding='utf_8_sig') as f2:
                    f2.write(title + "," + price + "," + author + "," + pub3 + '\n')
                print('\n书籍:', title, '\n价格:', price, '\n作者:', author, '\n出版社:', pub3)
            except IndexError:
                # no author link: record the author as 佚名 (anonymous) and grab the publisher another way
                pub1 = re.findall(r'dd_name.*?t.*?>.*?社?<', str(author1), re.S)
                pub = re.findall(r'>(.*?)<', str(pub1), re.S)
                pub = pub[0] if pub else ''
                with open("dsj_2002_20200126057_.csv", "a", encoding='utf_8_sig') as f2:
                    f2.write(title + "," + price + "," + '佚名' + "," + pub + '\n')
                print('\n书籍:', title, '\n价格:', price, '\n作者:', '佚名', '\n出版社:', pub)
if __name__ == '__main__':
    keyword = 'python爬虫'
    page = 7  # with range(1, page) above, this fetches pages 1-6
    spider(keyword, page)
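One caveat: the rows above are joined with bare commas, so a title that itself contains a comma will shift the columns in the CSV. A more robust variant, sketched below under the assumption that the same four fields are written (the filename dangdang_books.csv is illustrative), goes through the standard csv module, which quotes such fields automatically:

import csv

def save_row(title, price, author, publisher, path='dangdang_books.csv'):
    # csv.writer quotes fields that contain commas or quotes, so titles stay intact
    with open(path, 'a', newline='', encoding='utf_8_sig') as f:
        csv.writer(f).writerow([title, price, author, publisher])

Swapping the two f2.write(...) calls in spider() for save_row(title, price, author, pub3) and save_row(title, price, '佚名', pub) leaves the rest of the code unchanged.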