Out of boredom I tried pulling some data with Python. After a bit of analysis it turns out Amazon needs a cookie for normal access, and the key piece of that cookie is the session-id. The session-id, however, is generated by a piece of JavaScript, and my skills aren't yet good enough to reverse-engineer that JS, so I took a shortcut: use Selenium to simulate a visit and grab the session-id from the browser. Enough talk, here's the code ---

Some of the comments below are leftovers from testing and can be ignored...

---- This code is for learning purposes only and must not be used for anything else -------

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# File  : test_amazon.py
# Author: 违心唯心
# Date  : 2022/3/14


import requests
from lxml import etree
from selenium.webdriver import Chrome
from selenium.webdriver import ChromeOptions

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)
options.add_argument("--headless") #无头模式,即不显示浏览器

driver = Chrome(r'F:\PythonEnv\teduenv\tedu_exercise\exercise\biao\chromedriver.exe', options=options)  # remember to change this to your own chromedriver.exe location
# driver.maximize_window()
driver.set_page_load_timeout(30)

url = 'https://www.amazon.cn/'
driver.get(url=url)
# print(driver.get_cookies())
cookie = driver.get_cookies()
session_id = ''
for i in cookie:
    # pick the cookie by name rather than relying on the first secure cookie
    if i['name'] == 'session-id':
        session_id = i['value']
        break

# session_id = driver.get_cookies()[-1]['value']

driver.quit()
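# Side note, not part of the original flow: instead of extracting only the
# session-id, all of the Selenium cookies could be copied into a
# requests.Session, which hands the whole browser session over to requests.
# A minimal commented-out sketch, assuming the `cookie` list obtained above:
# sess = requests.Session()
# for c in cookie:
#     sess.cookies.set(c['name'], c['value'], domain=c.get('domain'))
# # later calls would then use sess.get(...) instead of requests.get(...)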



'''
Image URL:
//div[@class="a-section aok-relative s-image-square-aspect"]/img[@class="s-image"]

Name:
//span[@class="a-size-base-plus a-color-base a-text-normal"]

Price:
//div[@class="a-row a-size-base a-color-base"]/a/span/span[@class="a-offscreen"]


Listing URL:
https://www.amazon.cn/s?i=computers&rh=n%3A888491051&fs=true&page=3


Max page count:
//span[@class="s-pagination-item s-pagination-selected"]


'''


headers = {
    'host':'www.amazon.cn',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'referer':'https://www.amazon.cn/b?ie=UTF8&node=888491051&ref_=sa_menu_office_l3_b106246071',
    ## the session-id captured above by Selenium gets filled in here
    'cookie':'session-id={};'.format(session_id),
}

url = 'https://www.amazon.cn/s?i=computers&rh=n%3A888491051&fs=true&page={}'
#
# res = requests.get(url=url,headers=headers)
# # print(res.text)
#
# html = etree.HTML(res.text)
#
# max_page = html.xpath(r'//span[@class="s-pagination-item s-pagination-selected"]/text()')[0]
#
# pic_urls = html.xpath(r'//div[@class="a-section aok-relative s-image-square-aspect"]/img[@class="s-image"]/@src')
#
# name_list= html.xpath(r'//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()')
#
# price_list = html.xpath(r'//div[@class="a-row a-size-base a-color-base"]/a/span/span[@class="a-offscreen"]/text()')
#
# # print(len(pic_urls),'\r\n',len(name_list),'\r\n',len(price_list))
#
# count = 0
# for i,j,k in zip (pic_urls,name_list,price_list):
#     l=k.replace('¥','').replace(',','')
#     # print(i,j,l)
#     w_str = '{},{},{}\n'.format(j,l,i)
#     with open('test.csv','a+',encoding='utf-8') as fw:
#         fw.write(w_str)
#
#


def get_maxpage(url):
    res = requests.get(url=url, headers=headers)
    html = etree.HTML(res.text)
    # the disabled pagination item (the last page number) holds the total page count
    max_page = html.xpath(r'//span[@class="s-pagination-item s-pagination-disabled"]/text()')[0]
    return max_page
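# Note, not part of the original script: if Amazon answers with a captcha page
# or the layout changes, the XPath above returns an empty list and indexing
# [0] raises IndexError. A hedged, commented-out variant with a fallback (the
# default of 1 page is my own assumption):
# def get_maxpage_safe(url):
#     res = requests.get(url=url, headers=headers)
#     html = etree.HTML(res.text)
#     nodes = html.xpath(r'//span[@class="s-pagination-item s-pagination-disabled"]/text()')
#     return int(nodes[0]) if nodes else 1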


def get_content(url):
    contentlist = []
    res = requests.get(url=url, headers=headers)
    html = etree.HTML(res.text)
    pic_urls = html.xpath(r'//div[@class="a-section aok-relative s-image-square-aspect"]/img[@class="s-image"]/@src')
    name_list = html.xpath(r'//span[@class="a-size-base-plus a-color-base a-text-normal"]/text()')
    price_list = html.xpath(r'//div[@class="a-row a-size-base a-color-base"]/a/span/span[@class="a-offscreen"]/text()')
    for i, j, k in zip(pic_urls, name_list, price_list):
        # strip the currency symbol and thousands separator from the price
        l = k.replace('¥', '').replace(',', '')
        # strip spaces plus half- and full-width commas from the name so the CSV stays well-formed
        w_str = '{},{},{}\n'.format(j.replace(' ', '').replace(',', '').replace('，', ''), l, i)
        contentlist.append(w_str)
    return contentlist

def save(contentlist):
    # open the output file once and append every row
    with open('f:\\test_amazon.csv','a+',encoding='utf-8') as fw:
        for wstr in contentlist:
            fw.write(wstr)
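# Alternative sketch, not part of the original script: the comma-stripping in
# get_content exists only to keep the CSV well-formed; letting the csv module
# quote the fields would make it unnecessary. A hypothetical save_csv(),
# assuming rows of (name, price, pic_url) tuples:
# import csv
# def save_csv(rows, path='f:\\test_amazon.csv'):
#     with open(path, 'a+', newline='', encoding='utf-8') as fw:
#         csv.writer(fw).writerows(rows)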





if __name__ == '__main__':
    max_page = get_maxpage(url.format(1))
    c_list = []
    for i in range(1,int(max_page)+1):
        contentlist = get_content(url.format(i))
        c_list.append(contentlist)
        break  # testing leftover: remove this break to crawl every page instead of only page 1
    for k in c_list:
        save(k)
    print('done')
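
One more note: with the break removed, the loop above fires one request per page back to back. It would be friendlier to pause between pages; a minimal sketch of that loop (the 2-second delay is an arbitrary value of mine, and time would need to be imported at the top of the script):

import time

for i in range(1, int(max_page) + 1):
    contentlist = get_content(url.format(i))
    save(contentlist)
    time.sleep(2)  # arbitrary pause between pages, not part of the original script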

