vvanglro/cf-clearance

cf勾选了一次

jw-star opened this issue · 10 comments

页面需要两次勾选

import asyncio
from urllib import parse

from cf_clearance import async_stealth, async_cf_retry
from playwright.async_api import async_playwright

# Lazily created Playwright driver (p) and browser; both are populated by
# init_browser() on first use and reused afterwards.
browser = p = None
# Cookies captured after a successful Cloudflare challenge, reused by later
# contexts; None until a cf_clearance cookie has been seen.
cookies = None
# User agent string passed to every new browser context.
ua = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/109.0.0.0 Safari/537.36'
)


async def init_browser():
    """Create a fresh browser context and page, launching Chrome on first use.

    The Playwright driver and the browser are kept in the module-level ``p``
    and ``browser`` globals and are only started while ``browser`` is still
    ``None``. Every call creates a new context (1920x1080 viewport, ``zh-CN``
    locale, the module-level ``ua`` user agent) and opens one page in it.

    Returns:
        tuple: ``(context, page)``. This function never closes the context;
        that is left to the caller.
    """
    global browser, p

    if browser is None:
        p = await async_playwright().start()
        launch_options = {
            'headless': False,
            'channel': 'chrome',
            'args': ["--no-sandbox"],
        }
        browser = await p.chromium.launch(**launch_options)

    # Script injected into every page of the context: it redefines the
    # offsetHeight getter on HTMLDivElement so that the element with id
    # "modernizr" reports 1, and defers to the original getter otherwise.
    offset_height_patch = '''
    const elementDescriptor = Object.getOwnPropertyDescriptor(HTMLElement.prototype, 'offsetHeight');
    Object.defineProperty(HTMLDivElement.prototype, 'offsetHeight', {
        ...elementDescriptor,
        get: function() {
            if (this.id === 'modernizr') {
                return 1;
            }
            return elementDescriptor.get.apply(this);
        },
    });
        '''

    context = await browser.new_context(
        viewport={'width': 1920, 'height': 1080},
        locale='zh-CN',
        user_agent=ua,
    )
    await context.add_init_script(offset_height_patch)
    page = await context.new_page()
    return context, page


# Fetch a detail page
async def getCableInfo(url):
    """Open a detail page, pass the Cloudflare challenge and extract its media URL.

    A new context/page is obtained from ``init_browser()``. Cookies stored in
    the module-level ``cookies`` global (if any) are added to the context
    before navigation. After ``async_cf_retry`` succeeds, the context cookies
    are saved back into ``cookies`` when a ``cf_clearance`` cookie is present.
    A JPEG screenshot is written to ``<viewkey>/screen.jpg``, where ``viewkey``
    is the URL path with all slashes removed.

    Args:
        url: Address of the detail page to load.

    Returns:
        tuple: ``(m3u8Url, title)``. ``(None, None)`` when the Cloudflare
        challenge fails. If an exception occurs it is printed and the values
        gathered so far are returned; each one is ``''`` when it was not
        extracted.
    """
    global cookies
    context, page = await init_browser()
    m3u8Url = ''
    title = ''
    try:
        if cookies is not None:
            await context.add_cookies(cookies)

        await async_stealth(page, pure=True)
        # Generous navigation timeout: 4 minutes, expressed in milliseconds.
        await page.goto(url, timeout=1000 * 60 * 4)
        res = await async_cf_retry(page)
        if res:
            cookiess = await context.cookies()
            for cookie in cookiess:
                if cookie.get('name') == 'cf_clearance':
                    print(cookie.get('value'))
                    # Keep the whole cookie jar so later contexts can reuse it.
                    cookies = cookiess
            # Local name on purpose: the module-level ``ua`` is not modified.
            current_ua = await page.evaluate('() => {return navigator.userAgent}')
            print(current_ua)
        else:
            print("cf challenge fail")
            return None, None
        await asyncio.sleep(5)
        viewkey = parse.urlparse(url).path.replace('/', '')
        await page.screenshot(path=f'{viewkey}/screen.jpg', quality=30, type='jpeg')
        title = await page.evaluate('''() => {
               return document.querySelector("head > title").text
            }''')

        # Hold the raw page object in its own local. Previously it was stored
        # in m3u8Url first, so a failed lookup below made the function return
        # that raw object instead of a URL string.
        media_info = await page.evaluate('''() => {
                       return vidorev_jav_js_object
                    }''')
        m3u8Url = media_info['single_media_sources'][-1]['source_file']
        print(f'页面的m3u8 {m3u8Url}')

    except Exception as e:
        # Best-effort scrape: report the error and return what was collected.
        print(e)

    finally:
        await context.close()
    return m3u8Url, title


from tenacity import retry, stop_after_attempt, wait_fixed


# URLs of a single listing page
async def one_page(url):
    """Collect the video links, titles and durations shown on one listing page.

    A new context/page is obtained from ``init_browser()``. After
    ``async_cf_retry`` succeeds, the context cookies are stored in the
    module-level ``cookies`` global when a ``cf_clearance`` cookie is present.
    The context is always closed before returning or propagating an error.

    Args:
        url: Address of the listing page to load.

    Returns:
        tuple: ``(urls, titles, durations)`` — three lists produced by
        evaluating the selectors below on the page.

    Raises:
        RuntimeError: When ``async_cf_retry`` reports failure.
    """
    global cookies
    context, page = await init_browser()
    try:
        await async_stealth(page, pure=True)
        await page.goto(url, wait_until='load')
        passed = await async_cf_retry(page)
        if not passed:
            print("cf challenge fail")
            # Original note: raise so that the call is retried once.
            # NOTE(review): no retry wrapper is applied to this function in
            # the visible code — confirm whether one is intended.
            raise RuntimeError('重试后依然没有绕过cf')

        jar = await context.cookies()
        for entry in jar:
            if entry.get('name') != 'cf_clearance':
                continue
            print(entry.get('value'))
            cookies = jar
        user_agent = await page.evaluate('() => {return navigator.userAgent}')
        print(user_agent)

        await page.wait_for_timeout(1000 * 6)

        # Links and titles come from the same anchor elements.
        anchor_selector = 'article > div > div.listing-content > h3 > a'
        duration_selector = 'article > div > div.blog-pic > div > span:last-child'
        urls = await page.eval_on_selector_all(
            anchor_selector, 'nodes => nodes.map(node => node.href)')
        titles = await page.eval_on_selector_all(
            anchor_selector, 'nodes => nodes.map(node => node.title)')
        durations = await page.eval_on_selector_all(
            duration_selector, 'nodes => nodes.map(node => node.textContent)')

        print(urls)
    finally:
        await context.close()
    return urls, titles, durations


async def main():
    """Run the detail-page scrape for one hard-coded video URL."""
    detail_url = 'https://cableav.tv/sOfm8fgQmdg/'
    await getCableInfo(detail_url)
    # Listing-page variant, currently disabled:
    # await one_page('https://cableav.tv/category/selfie-porn/')


if __name__ == '__main__':
    # asyncio.run() creates and closes its own event loop. Calling
    # asyncio.get_event_loop() with no running loop is deprecated and raises
    # RuntimeError on recent Python versions.
    asyncio.run(main())

不要用channel='chrome', 使用默认的Chromium不会出现需要2次的情况.

好的,我试试,谢谢

改为Chromium在linux上运行还是不通过验证

改为Chromium在linux上运行还是不通过验证,windows确实可以 @vvanglro

麻烦仔细看readme.

是使用了XVFB环境的,之前没出过这个问题,之前使用channel='chrome' 可以的,可能是cf改东西了,现在绕不过去

我再看下吧,确实很奇怪,现在没找到问题原因,谢谢

@jw-star 用firefox吧, 这个无头模式可以过. 就不需要async_stealth方法了, 但是还是要用async_cf_retry.

好的