반응형

예전 방식이라 잘되지 않는다. 

다른 url로 했을때 데이터는 잘 가져왔다. 

css 보안이 걸려있는듯. 

 

##  인스타그램 이미지 크롤링
#    
##
import os
import sys
import konlpy
import pandas as pd
import numpy as np
os.environ['JAVA_OPTS'] = 'Xmx4096M'
    
## 시간 표시  ##################################### 
import time
import datetime
now = datetime.datetime.now()

timeserise = time.time()
timeserise = str(int(timeserise))
print(timeserise)
print(now)
#################################################  


#작업하는 경로(위치)가 어디인지 확인
print(os.getcwd())

prePath = "./Project/instagram_cr/"
file_name = prePath + "outputfile0.txt" 

# 라이브러리 추가
from bs4 import BeautifulSoup  #불러온 데이터를 구분지어 원라는 데이터 출력
from selenium import webdriver #Chromedriver를 사용하여, 자동화 시스템 구동
## chrome 버전 안맞으면 아래와 같은 에러 발생함. chromedriver 버전 확인 필수
#  selenium.common.exceptions.SessionNotCreatedException: Message: session not created: This version of ChromeDriver only supports Chrome version 94
# Current browser version is 105.0.5195.102 with binary path C:\Program Files\Google\Chrome\Application\chrome.exe
# 
# GoUrl : https://chromedriver.storage.googleapis.com/index.html?path=105.0.5195.52/
##


from urllib.request import urlopen
from urllib.parse  import quote_plus # ASCII 형태로 자동 변형
import requests
import shutil
 


testurl_01 = "https://www.instagram.com/explore/tags/"
testurl_02 = input("Please input the word to search for : ")
testurl_03 = testurl_01 + quote_plus(testurl_02)


print(testurl_03)

## 아래 오류때문에 추가함. options
#  USB: usb_device_handle_win.cc:1048 Failed to read descriptor from node connection: 시스템에 부착된 장치가 작동하지 않습니다. 
## options start
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-logging"])
#browser = webdriver.Chrome(options=options)
## options end 

#driver_01 = webdriver.Chrome()
driver_01 = webdriver.Chrome(options=options)
driver_01.get(testurl_03)

html_01 = driver_01.page_source
#print(html_01)

Source_01 = BeautifulSoup(html_01,"html.parser")
#Source_01 = BeautifulSoup(html_01,"lxml")
#Source_01 = BeautifulSoup(html_01)

time.sleep(5)

#print(Source_01)
print(Source_01.prettify())
 
o = open(prePath +'result_list.txt', 'w', encoding='utf-8')
o.write("")
o.write(Source_01.prettify())
o.close()    




var_list = [1, 3, 5, 7, 9]
for ii in var_list:
    print("----------------------------------------")


Demo_insta = Source_01.select('._a3wf._-kb.segoe') 
print(Demo_insta)

for each_div in Source_01.findAll('div',{'class':'list'}):
    print(each_div)


"""
x_1 = 1

for i in Demo_insta:
    print("https://www.instagram.com/" + i.a['href'])
    #img_01 = i.select_one('_aagt').img['src']
    #print(img_01)
"""

driver_01.close()
반응형

+ Recent posts