Out of boredom I had a go at pulling some data with Python. A bit of analysis showed that Amazon needs a cookie for normal access, and the key piece of that cookie is the session-id. The session-id, however, is generated by a piece of JavaScript, and my skills aren't yet up to reverse-engineering JS, so I took a shortcut and used selenium to simulate a visit and pick up the session-id. Enough talk, here's the code ---
A few of the commented-out lines were only used for testing and can be ignored......
---- This code is for learning purposes only and must not be used for any other purpose -------
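The script below assumes requests, lxml and selenium are installed (e.g. pip install requests lxml selenium), together with a chromedriver build that matches your local Chrome version.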
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File : test_amazon.py
# Author: 违心唯心
# Date : 2022/3/14
import requests
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions
options = ChromeOptions()
# Hide the "Chrome is being controlled by automated test software" automation flags
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--headless")  # headless mode: don't show a browser window
driver = Chrome(r'F:\PythonEnv\teduenv\tedu_exercise\exercise\biao\chromedriver.exe', options=options)  # change this to your own chromedriver.exe location
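# Note: passing the driver path positionally is the Selenium 3 style; on Selenium 4
# the path goes through a Service object instead, roughly:
#   from selenium.webdriver.chrome.service import Service
#   driver = Chrome(service=Service(r'path\to\chromedriver.exe'), options=options)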
# driver.maximize_window()
driver.set_page_load_timeout(30)
url = 'https://www.amazon.cn/'
driver.get(url=url)
cookie = driver.get_cookies()  # list of dicts like {'name': ..., 'value': ..., 'secure': ...}
session_id = ''
for i in cookie:
    # Pick out the session-id cookie by name; matching on the name is more
    # robust than grabbing the first cookie whose 'secure' flag is set.
    if i['name'] == 'session-id':
        session_id = i['value']
        break
driver.quit()
'''
Reference XPaths and URLs:
Image URL:
//div[@class="a-section aok-relative s-image-square-aspect"]/img[@class="s-image"]
Name:
//span[@class="a-size-base-plus a-color-base a-text-normal"]
Price:
//div[@class="a-row a-size-base a-color-base"]/a/span/span[@class="a-offscreen"]
Listing URL:
https://www.amazon.cn/s?i=computers&rh=n%3A888491051&fs=true&page=3
Max page count:
//span[@class="s-pagination-item s-pagination-selected"]
'''
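# These class-based XPaths matched Amazon's result-page markup when this was
# written (early 2022); Amazon changes its class names regularly, so expect to
# re-derive them from the page source if the lists come back empty.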
headers = {
    'host': 'www.amazon.cn',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'referer': 'https://www.amazon.cn/b?ie=UTF8&node=888491051&ref_=sa_menu_office_l3_b106246071',
    # session-id is the value grabbed via selenium above
    'cookie': 'session-id={};'.format(session_id),
}
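# Equivalent alternative: requests can attach the cookie itself instead of a
# hand-built header, e.g.
#   requests.get(url, headers=headers, cookies={'session-id': session_id})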
url = 'https://www.amazon.cn/s?i=computers&rh=n%3A888491051&fs=true&page={}'
def get_maxpage(url):
    res = requests.get(url=url, headers=headers)
    html = etree.HTML(res.text)
    max_page = html.xpath(r'//span[@class="s-pagination-item s-pagination-disabled"]/text()')[0]
    return max_page
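# Note: the reference block above lists s-pagination-selected (the current page)
# for the max page, while this function reads s-pagination-disabled; which span
# actually carries the last page number depends on Amazon's current markup.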
def get_content(url):
    contentlist = []
    res = requests.get(url=url, headers=headers)
    html = etree.HTML(res.text)
    pic_urls = html.xpath(r'//div[@class="a-section aok-relative s-image-square-aspect"]/img[@class="s-image"]/@src')
    name_list = html.xpath(r'//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()')
    price_list = html.xpath(r'//div[@class="a-row a-size-base a-color-base"]/a/span/span[@class="a-offscreen"]/text()')
    for i, j, k in zip(pic_urls, name_list, price_list):
        l = k.replace('¥', '').replace(',', '')  # strip the currency sign and thousands separator from the price
        # strip spaces and commas (ASCII and full-width) from the name so they can't break the CSV columns
        w_str = '{},{},{}\n'.format(j.replace(' ', '').replace(',', '').replace(',', ''), l, i)
        contentlist.append(w_str)
    return contentlist
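# Caveat: zip() stops at the shortest of the three lists, so if one XPath
# matches fewer nodes than the others (sponsored slots, items without a price,
# ...) rows can silently pair a name with the wrong price or image.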
def save(contentlist):
    # open the file once and append every row
    with open('f:\\test_amazon.csv', 'a+', encoding='utf-8') as fw:
        for wstr in contentlist:
            fw.write(wstr)
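# A sketch of an alternative using the standard csv module (hypothetical
# save_csv helper): csv.writer quotes fields itself, so the comma-stripping in
# get_content would be unnecessary if it returned (name, price, url) tuples:
#   import csv
#   def save_csv(rows):
#       with open('f:\\test_amazon.csv', 'a+', newline='', encoding='utf-8') as fw:
#           csv.writer(fw).writerows(rows)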
if __name__ == '__main__':
    max_page = get_maxpage(url.format(1))
    c_list = []
    for i in range(1, int(max_page) + 1):
        contentlist = get_content(url.format(i))
        c_list.append(contentlist)
        break  # test run: only page 1 is fetched; delete this break to crawl every page
    for k in c_list:
        save(k)
    print('done')
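If everything works, f:\test_amazon.csv ends up with one name,price,image-URL row per product on the crawled pages.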