admin 发表于 2022-12-8 15:53:37

正则表达式爬取当当网并保存为csv

import re
import requests

# Request headers sent with every Dangdang search request.
# The User-Agent mimics a desktop Chrome browser and the Cookie was captured
# from a real browsing session — the site may reject cookie-less requests.
# NOTE(review): this session cookie is time-limited and will eventually
# expire; refresh it from a live session if requests start failing.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER',
   'Cookie':'__permanent_id=20221203162355205379923102881427769; __rpm=|s_112100...1670079692842; search_passback=b6219fd996d8e8e9e9118c63fc010000de89c800e1118c63; ddscreen=2; dest_area=country_id=9000&province_id=111&city_id =0&district_id=0&town_id=0; __visit_id=20221204112010093339809160925577017; __out_refer=; __trace_id=20221204112010094534951493076142788; pos_9_end=1670124010185; pos_0_start=1670124010224; pos_0_end=1670124010229; ad_ids=2095975|#1'
}


def spider(keyword, page):
    """Scrape Dangdang search results and append them to a CSV file.

    Fetches result pages 1 .. page-1 for *keyword*, extracts book title,
    price, author and publisher with regular expressions, appends one CSV
    row per book to 'dsj_2002_20200126057_.csv', and prints each record.

    Fixes over the previous version:
      * ``title`` (a ``re.findall`` list) was concatenated with a str,
        raising ``TypeError`` on every item; the bare ``except:`` caught it
        and then re-raised the same error in its own body.
      * The except branch wrote to a differently named file
        ('dsj_2002_20200126057_csv', missing the '.csv' dot).
      * The output file was reopened for every single item.
      * Request failures/timeouts now skip the page instead of crashing.

    Args:
        keyword: search term inserted into the Dangdang search URL.
        page: one past the last page index to fetch (``range(1, page)``).
    """
    for i in range(1, page):
        url = 'http://search.dangdang.com/?key=' + str(keyword) + '&page_index=' + str(i)
        print(url)
        try:
            res = requests.get(url, headers=headers, timeout=2)
            res.raise_for_status()
        except requests.RequestException as exc:
            # Best-effort crawl: report the failure and move to the next page.
            print('request failed:', url, exc)
            continue
        html = res.text

        # Open the output file once per page, not once per item.
        with open("dsj_2002_20200126057_.csv", "a", encoding='utf_8_sig') as f2:
            # Each result <li> carries a ddt-pit attribute giving its rank
            # on the page; iterate possible ranks and parse each item.
            for a in range(1, 60):
                lis = re.findall(r'<li ddt-pit="' + str(a) + '"(.*?)>(.*?)</li>', html, re.S)
                if not lis:
                    continue
                item = str(lis)
                titles = re.findall(r'<a title="(.*?)"', item, re.S)    # 书名 (book title)
                if not titles:
                    continue
                title = titles[0]
                prices = re.findall(r'_now_price">.*?(\d+\D\d+)', item, re.S)   # 价格 (price)
                price = prices[0] if prices else ''
                author1 = re.findall(r'<p class="search_book_author">(.*?)</p>', item, re.S)
                info = str(author1)

                # Author name is embedded in the author link's query string.
                authors = re.findall(r'2=(.*?)&medium', info, re.S)
                if authors:
                    author = authors[0]
                    pub = re.findall(r'P_cbs.*?>(.*?)</a', info, re.S)
                else:
                    # No author link: record '佚名' (anonymous) and fall back
                    # to a looser publisher pattern.
                    author = '佚名'
                    pub1 = re.findall(r'dd_name.*?t.*?>.*?社?<', info, re.S)
                    pub = re.findall(r'>(.*?)<', str(pub1), re.S)

                f2.write(title + "," + str(price) + "," + str(author) + "," + str(pub) + '\n')
                print('\n书籍:', title, '\n价格:', price, '\n作者:', author, '\n出版社:', pub)







if __name__ == '__main__':
    # Crawl the first six result pages (range(1, 7)) for the search term.
    search_term = 'python爬虫'
    last_page = 7
    spider(search_term, last_page)
页: [1]
查看完整版本: 正则表达式爬取当当网并保存为csv