admin 发表于 2022-12-8 15:53:37

正则表达式爬取当当网并保存为csv

import re
import requests

# Request headers sent with every Dangdang search request.
# The User-Agent mimics a desktop Chrome browser and the Cookie was captured
# from a real browsing session — the site may reject cookie-less requests.
# NOTE(review): this session cookie is time-limited and will eventually
# expire; refresh it from a live session if requests start failing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
   'Cookie':'__permanent_id=20221203162355205379923102881427769; __rpm=|s_112100...1670079692842; search_passback=b6219fd996d8e8e9e9118c63fc010000de89c800e1118c63; ddscreen=2; dest_area=country_id=9000&province_id=111&city_id =0&district_id=0&town_id=0; __visit_id=20221204112010093339809160925577017; __out_refer=; __trace_id=20221204112010094534951493076142788; pos_9_end=1670124010185; pos_0_start=1670124010224; pos_0_end=1670124010229; ad_ids=2095975|#1'
}


def spider(keyword, page):
    """Scrape Dangdang search results and append them to a CSV file.

    Fetches result pages 1 .. page-1 for *keyword*, extracts book title,
    price, author and publisher with regular expressions, appends one CSV
    row per book to 'dsj_2002_20200126057_.csv', and prints each record.

    Fixes over the previous version:
      * ``title`` (a ``re.findall`` list) was concatenated with a str,
        raising ``TypeError`` on every item; the bare ``except:`` caught it
        and then re-raised the same error in its own body.
      * The except branch wrote to a differently named file
        ('dsj_2002_20200126057_csv', missing the '.csv' dot).
      * The output file was reopened for every single item.
      * Request failures/timeouts now skip the page instead of crashing.

    Args:
        keyword: search term inserted into the Dangdang search URL.
        page: one past the last page index to fetch (``range(1, page)``).
    """
    for i in range(1, page):
        url = 'http://search.dangdang.com/?key=' + str(keyword) + '&page_index=' + str(i)
        print(url)
        try:
            res = requests.get(url, headers=headers, timeout=2)
            res.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure and move to the next page.
            print('request failed:', url, exc)
            continue
        html = res.text

        # Open the output file once per page, not once per item.
        with open("dsj_2002_20200126057_.csv", "a", encoding='utf_8_sig') as f2:
            # Each result <li> carries a ddt-pit attribute giving its rank
            # on the page; iterate possible ranks and parse each item.
            for a in range(1, 60):
                lis = re.findall(r'<li ddt-pit="' + str(a) + '"(.*?)>(.*?)</li>', html, re.S)
                if not lis:
                    continue
                item = str(lis)
                titles = re.findall(r'<a title="(.*?)"', item, re.S)    # 书名 (book title)
                if not titles:
                    continue
                title = titles[0]
                prices = re.findall(r'_now_price">.*?(\d+\D\d+)', item, re.S)   # 价格 (price)
                price = prices[0] if prices else ''
                author1 = re.findall(r'<p class="search_book_author">(.*?)</p>', item, re.S)
                info = str(author1)

                # Author name is embedded in the author link's query string.
                authors = re.findall(r'2=(.*?)&medium', info, re.S)
                if authors:
                    author = authors[0]
                    pub = re.findall(r'P_cbs.*?>(.*?)</a', info, re.S)
                else:
                    # No author link: record '佚名' (anonymous) and fall back
                    # to a looser publisher pattern.
                    author = '佚名'
                    pub1 = re.findall(r'dd_name.*?t.*?>.*?社?<', info, re.S)
                    pub = re.findall(r'>(.*?)<', str(pub1), re.S)

                f2.write(title + "," + str(price) + "," + str(author) + "," + str(pub) + '\n')
                print('\n书籍:', title, '\n价格:', price, '\n作者:', author, '\n出版社:', pub)







if __name__ == '__main__':
    # Crawl the first six result pages (range(1, 7)) for the search term.
    search_term = 'python爬虫'
    last_page = 7
    spider(search_term, last_page)
页: [1]
查看完整版本: 正则表达式爬取当当网并保存为csv