我覺得我生活在這世上二十多年裡,去過最多的餐廳就是肯德基。小時候逢生日必去;現在長大了,肯德基成了我的日常零食:下班後從門前路過,餓了便會進去點份黃金雞塊或者小吃拼盤;早上路過,會買杯咖啡。主要是快捷、美味且飽腹,而且到處都是,總是會路過。現在只要一餓,心心念念的便是肯德基的味道。 環境介紹 python 3.6 p ...
我覺得我生活在這世上二十多年裡,去過最多的餐廳就是肯德基。小時候逢生日必去;現在長大了,肯德基成了我的日常零食:下班後從門前路過,餓了便會進去點份黃金雞塊或者小吃拼盤;早上路過,會買杯咖啡。主要是快捷、美味且飽腹,而且到處都是,總是會路過。現在只要一餓,心心念念的便是肯德基的味道。
環境介紹
python 3.6
pycharm
requests
csv
爬蟲的一般思路
1、確定爬取的url路徑,headers參數
2、發送請求 -- requests 模擬瀏覽器發送請求,獲取響應數據
3、解析數據
4、保存數據
步驟
1、確定爬取的url路徑,headers參數
先爬取北京的數據
# Store-list endpoint of the KFC China site, queried with op=keyword.
base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

# Browser-like User-Agent so the request resembles one sent by Chrome.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36',
}

# Form fields sent in the POST body: search keyword, page index and page size.
data = {
    'cname': '',
    'pid': '',
    'keyword': '北京',
    'pageIndex': '1',
    'pageSize': '10',
}
2、發送請求 -- requests 模擬瀏覽器發送請求,獲取響應數據
# POST the form fields to the endpoint and decode the JSON body of the reply.
response = requests.post(url=base_url, headers=headers, data=data)
json_data = response.json()
# Debug aid: uncomment to inspect the decoded payload.
# pprint.pprint(json_data)
3、解析數據
# The store records are read from the 'Table1' key of the decoded reply.
data_list = json_data['Table1']
# Debug aid: uncomment to inspect the records.
# pprint.pprint(data_list)

# Walk the records and pull out the fields of interest.
for ls in data_list:
    storeName = ls['storeName'] + '餐廳'  # restaurant name
    cityName = ls['cityName']  # city of the restaurant
    addressDetail = ls['addressDetail']  # restaurant address
    pro = ls['pro']  # restaurant details
    # print(storeName, cityName, addressDetail, pro)
4、保存數據
# NOTE(review): presumably meant to run once per store, i.e. inside the
# `for ls in data_list:` loop of step 3 -- confirm against the complete script.
print('正在爬取:', storeName)
# newline='' hands line-ending control to the csv module, which avoids
# blank rows between records on Windows.
with open('data.csv', 'a', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',')  # fields are separated by commas
    csvwriter.writerow([storeName, cityName, addressDetail, pro])  # append one store as one CSV row
5、全國315個城市的數據
城市名稱取自拉勾網的城市列表介面(https://www.lagou.com/lbs/getAllCitySearchLabels.json),共 315 個城市;再逐一以城市名稱作為關鍵字查詢肯德基門店
# coding:utf-8 import requests import csv import time import random ip = [{'HTTP': '1.199.31.213:9999'}, {'HTTP': '182.46.197.33:9999'}, {'HTTP': '58.18.133.101:56210'}, {'HTTP': '175.44.108.123:9999'}, {'HTTP': '123.52.97.90:9999'}, {'HTTP': '182.92.233.137:8118'}, {'HTTP': '223.242.225.42:9999'}, {'HTTP': '113.194.28.84:9999'}, {'HTTP': '113.194.30.115:9999'}, {'HTTP': '113.195.19.41:9999'}, {'HTTP': '144.123.69.123:9999'}, {'HTTP': '27.192.168.202:9000'}, {'HTTP': '163.204.244.179:9999'}, {'HTTP': '112.84.53.197:9999'}, {'HTTP': '117.69.13.69:9999'}, {'HTTP': '1.197.203.214:9999'}, {'HTTP': '125.108.111.22:9000'}, {'HTTP': '171.35.169.69:9999'}, {'HTTP': '171.15.173.234:9999'}, {'HTTP': '171.13.103.52:9999'}, {'HTTP': '183.166.97.201:9999'}, {'HTTP': '60.2.44.182:44990'}, {'HTTP': '58.253.158.21:9999'}, {'HTTP': '47.94.89.87:3128'}, {'HTTP': '60.13.42.235:9999'}, {'HTTP': '60.216.101.46:32868'}, {'HTTP': '117.90.137.91:9000'}, {'HTTP': '123.169.164.163:9999'}, {'HTTP': '123.169.162.230:9999'}, {'HTTP': '125.108.119.189:9000'}, {'HTTP': '163.204.246.68:9999'}, {'HTTP': '223.100.166.3:36945'}, {'HTTP': '113.195.18.134:9999'}, {'HTTP': '163.204.245.50:9999'}, {'HTTP': '125.108.79.50:9000'}, {'HTTP': '163.125.220.205:8118'}, {'HTTP': '1.198.73.246:9999'}, {'HTTP': '175.44.109.51:9999'}, {'HTTP': '121.232.194.47:9000'}, {'HTTP': '113.194.30.27:9999'}, {'HTTP': '129.28.183.30:8118'}, {'HTTP': '123.169.165.73:9999'}, {'HTTP': '120.83.99.190:9999'}, {'HTTP': '175.42.128.48:9999'}, {'HTTP': '123.101.212.223:9999'}, {'HTTP': '60.190.250.120:8080'}, {'HTTP': '125.94.44.129:1080'}, {'HTTP': '118.112.195.91:9999'}, {'HTTP': '110.243.5.163:9999'}, {'HTTP': '118.89.91.108:8888'}, {'HTTP': '125.122.199.13:9000'}, {'HTTP': '171.11.28.248:9999'}, {'HTTP': '211.152.33.24:39406'}, {'HTTP': '59.62.35.130:9000'}, {'HTTP': '123.163.96.124:9999'}] def get_page(keyword): global base_url base_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword' global headers headers = 
{ 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'} data = { 'cname': '', 'pid': '', 'keyword': keyword, 'pageIndex': '1', 'pageSize': '10', } try: response = requests.post(url=base_url, headers=headers, data=data) json_data = response.json() page = json_data['Table'][0]['rowcount'] if page % 10 > 0: page_num = page // 10 + 1 else: page_num = page // 10 return page_num except Exception as e: print(e) def send_request(keyword): page_num = get_page(keyword) try: for page in range(1, page_num + 1): print('============正在獲取第{}頁信息=========='.format(page)) data = { 'cname': '', 'pid': '', 'keyword': keyword, 'pageIndex': str(page), 'pageSize': '10', } response = requests.post(url=base_url, headers=headers, data=data, proxies=random.choice(ip),timeout=3) json_data = response.json() # pprint.pprint(json_data) time.sleep(0.4) # 3、解析數據 data_list = json_data['Table1'] # pprint.pprint(data_list) # 構建迴圈,解析數據欄位 for ls in data_list: storeName = ls['storeName'] + '餐廳' # 餐廳名稱 cityName = ls['cityName'] # 餐廳城市 addressDetail = ls['addressDetail'] # 餐廳地址 pro = ls['pro'] # 餐廳詳情 # print(storeName, cityName, addressDetail, pro) # 4、保存數據 print('正在爬取:', storeName) with open('data5.csv', 'a', newline='') as csvfile: # newline='' 指定一行一行寫入 csvwriter = csv.writer(csvfile, delimiter=',') # delimiter=',' csv數據的分隔符 csvwriter.writerow([storeName, cityName, addressDetail, pro]) # 序列化數據,寫入csv time.sleep(0.2) except Exception as e: print(e) if __name__ == '__main__': # https://www.lagou.com/lbs/getAllCitySearchLabels.json all_cities = ['安陽', '安慶', '鞍山', '澳門特別行政區', '安順', '阿勒泰', '安康', '阿克蘇', '阿壩藏族羌族自治州', '阿拉善盟', '北京', '保定', '蚌埠', '濱州', '包頭', '寶雞', '北海', '亳州', '百色', '畢節', '巴中', '本溪', '巴音郭楞', '巴彥淖爾', '博爾塔拉', '保山', '白城', '白山', '成都', '長沙', '重慶', '長春', '常州', '滄州', '赤峰', '郴州', '潮州', '常德', '朝陽', '池州', '滁州', '承德', '昌吉', '楚雄', '崇左', '東莞', '大連', '德州', '德陽', '大慶', '東營', '大同', '達州', '大理', '德巨集', '丹東', '定西', '儋州', '迪慶', '鄂州', '恩施', 
'鄂爾多斯', '佛山', '福州', '阜陽', '撫州', '撫順', '阜新', '防城港', '廣州', '貴陽', '桂林', '贛州', '廣元', '貴港', '廣安', '固原', '甘孜藏族自治州', '杭州', '合肥', '惠州', '哈爾濱', '海口', '呼和浩特', '邯鄲', '衡陽', '湖州', '淮安', '海外', '菏澤', '衡水', '河源', '懷化', '黃岡', '黃石', '黃山', '淮北', '淮南', '葫蘆島', '呼倫貝爾', '漢中', '紅河', '賀州', '河池', '鶴壁', '鶴崗', '海東', '哈密', '濟南', '金華', '嘉興', '濟寧', '江門', '晉中', '吉林', '九江', '揭陽', '焦作', '荊州', '錦州', '荊門', '吉安', '景德鎮', '晉城', '佳木斯', '酒泉', '濟源', '昆明', '開封', '克拉瑪依', '喀什', '蘭州', '臨沂', '廊坊', '洛陽', '柳州', '六安', '聊城', '連雲港', '呂梁', '瀘州', '拉薩', '麗水', '樂山', '龍岩', '臨汾', '漯河', '六盤水', '涼山彞族自治州', '麗江', '婁底', '萊蕪', '遼源', '隴南', '臨夏', '來賓', '綿陽', '茂名', '馬鞍山', '梅州', '牡丹江', '眉山', '南京', '寧波', '南昌', '南寧', '南通', '南陽', '南充', '寧德', '南平', '內江', '莆田', '濮陽', '萍鄉', '平頂山', '盤錦', '攀枝花', '平涼', '普洱', '青島', '泉州', '清遠', '秦皇島', '曲靖', '衢州', '齊齊哈爾', '黔西南', '黔南', '欽州', '黔東南', '慶陽', '七台河', '日照', '深圳', '上海', '蘇州', '沈陽', '石家莊', '紹興', '汕頭', '宿遷', '商丘', '三亞', '上饒', '宿州', '邵陽', '十堰', '遂寧', '韶關', '三門峽', '汕尾', '隨州', '三沙', '三明', '綏化', '石嘴山', '四平', '朔州', '商洛', '松原', '天津', '太原', '唐山', '台州', '泰安', '泰州', '天水', '通遼', '銅陵', '臺灣', '銅仁', '銅川', '鐵嶺', '塔城', '天門', '通化', '武漢', '無錫', '溫州', '濰坊', '烏魯木齊', '蕪湖', '威海', '梧州', '渭南', '吳忠', '烏蘭察布', '文山', '烏海', '西安', '廈門', '徐州', '新鄉', '西寧', '咸陽', '許昌', '邢台', '孝感', '襄陽', '香港特別行政區', '湘潭', '信陽', '忻州', '咸寧', '宣城', '西雙版納', '湘西土家族苗族自治州