Douyin Hot Trending Data Scraper and Video Downloader

import os
import json
import datetime
import urllib.request

import requests
import bs4 as bs
import pandas as pd
# from IPython.core.display import HTML  # only needed for the optional inline HTML previews below
def generate_path(path):
    """Create the folder at `path` if it does not already exist."""
    if not os.path.exists(path):
        os.makedirs(path)

def video(video_url, file_name):
    """Download a video to `file_name`."""
    return urllib.request.urlretrieve(video_url, file_name)

def cover(cover_url, file_name):
    """Download a cover image to `file_name`."""
    return urllib.request.urlretrieve(cover_url, file_name)

def time(timestamp):
    """Convert a Unix timestamp to a human-readable local datetime string."""
    return str(datetime.datetime.fromtimestamp(timestamp))
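# A quick, illustrative sanity check of the helpers. The cover URL below is a
# placeholder, not a real Douyin asset, and the printed time depends on the local timezone:
#   generate_path('./tmp')                                    # creates ./tmp if missing
#   cover('https://example.com/cover.jpeg', './tmp/c.jpeg')   # saves the image locally
#   print(time(1600000000))                                   # '2020-09-13 12:26:40' in UTC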
# The trending-list API endpoint is kept in a text file in the GitHub repo;
# strip() guards against a trailing newline in that file.
api = requests.get('https://raw.githubusercontent.com/xjincomm/Douyin/master/Douyin%20Trending%20API.txt').text.strip()
resp = requests.get(api)
soup = bs.BeautifulSoup(resp.content, 'html.parser')
hot = json.loads(soup.text)
last_update = hot['data']['active_time']
trending_data = hot['data']['word_list']
trend = pd.DataFrame(trending_data)
date = last_update.split(' ')[0]
# Keep the first cover-image URL for each trending word (None when no cover is given).
Word_cover = []
for i in trend['word_cover']:
    if isinstance(i, dict):
        Word_cover.append(i['url_list'][0])
    else:
        Word_cover.append(None)
trend = trend.drop(columns=['word_cover', 'challenge_id'])
# 'word' holds the title of each trending topic.
generate_path('./trend')
# Optional: render the table as HTML with inline cover images.
#trend['cover'] = ['<img src="' + str(i) + '" width="60" >' for i in Word_cover]
#trend_visual = HTML(trend.head(3).to_html(escape=False, formatters=trend['cover']))
#trend.to_html('./trend/trend_' + last_update + '.html', escape=False)
trend.to_csv('./trend/trend_' + date + '.csv', encoding='utf-8-sig', index=False)
#trend_visual
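# Illustrative: a quick look at what was just scraped and saved.
#   print(last_update)            # e.g. '2020-08-01 12:00:00' (exact format set by the API)
#   print(trend['word'].head())   # the top trending topic titles
#   pd.read_csv('./trend/trend_' + date + '.csv')   # reload the saved table later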
def scraper(topic):
    """Scrape the video list for one trending topic, save the metadata to CSV, and download the videos."""
    generate_path('./' + topic)
    topic_api = 'https://aweme-hl.snssdk.com/aweme/v1/hot/search/video/list/?hotword='
    resp = requests.get(topic_api + topic)
    soup = bs.BeautifulSoup(resp.content, 'html.parser')
    data = json.loads(soup.text)['aweme_list']
    # Per-video metadata.
    desc = [info['desc'] for info in data]
    time_stamp = [info['create_time'] for info in data]
    create_time = [time(info['create_time']) for info in data]
    nickname = [info['author']['nickname'] for info in data]
    verify = [info['author']['custom_verify'] for info in data]
    share_count = [info['statistics']['share_count'] for info in data]
    forward_count = [info['statistics']['forward_count'] for info in data]
    like_count = [info['statistics']['digg_count'] for info in data]
    comment_count = [info['statistics']['comment_count'] for info in data]
    download_count = [info['statistics']['download_count'] for info in data]
    cover_url = [info['video']['cover']['url_list'][0] for info in data]
    cover_visual = ['<img src="'+ url + '" width="100" >' for url in cover_url]
    # First playable URL for each video; None when no play address is available.
    video_url = []
    for info in data:
        try:
            video_url.append(info['video']['play_addr']['url_list'][0])
        except (KeyError, IndexError, TypeError):
            video_url.append(None)
    df = pd.DataFrame({'desc': desc, 'nickname': nickname, 'verify': verify, 'time_stamp': time_stamp,
                       'create_time': create_time, 'share_count': share_count, 'forward_count': forward_count,
                       'like_count': like_count, 'comment_count': comment_count,
                       'download_count': download_count, 'video_url': video_url,
                       'cover_visual': cover_visual})
    df.to_csv('./' + topic + '/' + topic + '.csv', encoding='utf-8-sig', index=False)
    # Optional: export an HTML table with inline cover images.
    #df.to_html('./' + topic + '/' + topic + '.html', escape=False)
    #video_visual = HTML(df.to_html(escape=False, formatters=df['cover_visual']))
    # Download each video, named after its creation timestamp; failures are logged and skipped.
    for num in range(len(data)):
        try:
            video(df['video_url'][num], './' + topic + '/' + str(df['time_stamp'][num]) + '.mp4')
            print('topic: ' + topic + ', video #' + str(num) + ': ' + str(df['time_stamp'][num]) + '......Succeeded')
        except Exception:
            print('topic: ' + topic + ', video #' + str(num) + ': ' + str(df['time_stamp'][num]) + '......Failed')
            continue
def douyin_trend():
    """Scrape every topic currently on the trending list."""
    for word in trend['word']:
        scraper(word)

def douyin_topic(topic):
    """Scrape a single topic by title."""
    scraper(topic)
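
A minimal usage sketch, assuming the module-level setup above has already fetched the trending list: douyin_trend() walks every topic on the list, while douyin_topic() targets a single topic by its title.

# Scrape and download every topic currently trending.
douyin_trend()

# Or scrape just one topic, here the top entry of the trending table.
douyin_topic(trend['word'].iloc[0])

Either call produces one folder per topic containing a CSV of video metadata plus the downloaded .mp4 files, as set up in scraper() above.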
