Compare commits
4 Commits
d2a9c700d9
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
40ed31f5be | ||
|
|
37771ebd31 | ||
|
|
2e04cc9061 | ||
|
|
7caeeaaff5 |
5
.gitignore
vendored
5
.gitignore
vendored
@@ -160,3 +160,8 @@ cython_debug/
|
|||||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||||
#.idea/
|
#.idea/
|
||||||
|
|
||||||
|
driver
|
||||||
|
html
|
||||||
|
venv
|
||||||
|
*.log
|
||||||
|
imgs
|
||||||
|
|||||||
64
main.py
Normal file
64
main.py
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
from fake_useragent import UserAgent
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver import Firefox
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.common.keys import Keys
|
||||||
|
from selenium.webdriver.firefox.service import Service
|
||||||
|
from selenium.webdriver.firefox.options import Options
|
||||||
|
import time
|
||||||
|
from datetime import date, datetime
|
||||||
|
import os
|
||||||
|
import codecs
|
||||||
|
|
||||||
|
|
||||||
|
# Scrape a Pinterest board: open it in Firefox, scroll until the page stops
# growing, then save the rendered HTML to a timestamped file under basePath.

# Timestamp (day-month-year-hour-minute) used to name the saved snapshot.
now = datetime.now()
formatingDate = now.strftime("%d-%m-%Y-%H-%M")

# Raw string: the backslashes are path separators, not escape sequences.
basePath = r"D:\Programming\PythonProgects\PinterestParse\html"

# NOTE(review): `ua` is not used anywhere in this script, and the
# `use_cache_server` keyword may not be accepted by newer fake_useragent
# releases -- confirm whether this line is still needed.
ua = UserAgent(use_cache_server=False)

firefoxProfie = r'C:\Users\danamir.ASUMADI\AppData\Roaming\Mozilla\Firefox\Profiles\9vfgbpct.default'
options = Options()
# "profile" is not a Firefox preference, so set_preference('profile', ...)
# does not select a profile; the profile directory is passed on the
# command line instead.
options.add_argument('-profile')
options.add_argument(firefoxProfie)

# geckodriver is expected in ./driver relative to the working directory.
driverPath = os.path.join(os.getcwd(), 'driver', 'geckodriver.exe')
service = Service(driverPath)
browser = Firefox(service=service, options=options)

url = "https://ru.pinterest.com/alexdanamir/%D1%87%D0%B5%D1%80%D1%82%D0%B5%D0%B6%D0%B8/"

# Seconds to wait after each scroll so lazily loaded content can appear.
SCROLL_PAUSE_TIME = 5

try:
    browser.get(url)

    # Get scroll height
    last_height = browser.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)

        # Stop once scrolling no longer makes the document taller.
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    html = browser.page_source
finally:
    # quit() ends the whole driver session; close() only closes the window.
    # Running it in `finally` also cleans up when an error occurs above.
    browser.quit()

htmlName = "pinterest" + "-" + formatingDate + ".txt"
saveFile = os.path.join(basePath, htmlName)

# Create the target folder on first run instead of failing in open().
os.makedirs(basePath, exist_ok=True)

# Explicit UTF-8: the page can contain characters that the platform's
# default encoding cannot represent.
with open(saveFile, "w", encoding="utf-8") as savedFile:
    savedFile.write(html)
|
||||||
|
|
||||||
22
parsehtml.py
Normal file
22
parsehtml.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
import re
|
||||||
|
import os
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# Folder that holds the saved page snapshots. Raw string: the backslashes
# are path separators, not escape sequences.
DEFAULT_BASE_PATH = r"D:\Programming\PythonProgects\PinterestParse\html"


def _read_text(path):
    """Return the text of *path*, decoded as UTF-8 with a cp1251 fallback."""
    with open(path, 'rb') as source:
        raw = source.read()
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Fall back to the code page the original code tried to decode with.
        return raw.decode('cp1251', errors='replace')


def main(base_path=DEFAULT_BASE_PATH):
    """Log the contents of the first saved snapshot found in *base_path*.

    Args:
        base_path: Directory to look in. Defaults to the project's
            hard-coded snapshot folder.

    Returns:
        The decoded text of the chosen file, or ``None`` when the directory
        contains no regular files.

    Raises:
        OSError: If *base_path* cannot be listed or the file cannot be read.
    """
    logging.basicConfig(filename='app.log', level=logging.DEBUG)

    # Sort for a deterministic choice and skip sub-directories, which
    # cannot be opened as files.
    candidates = sorted(
        name for name in os.listdir(base_path)
        if os.path.isfile(os.path.join(base_path, name))
    )
    if not candidates:
        logging.warning("No files to parse in %s", base_path)
        return None

    fileForParse = os.path.join(base_path, candidates[0])
    logging.debug(fileForParse)

    text = _read_text(fileForParse)
    logging.debug(text)
    return text


if __name__ == '__main__':
    main()
|
||||||
Reference in New Issue
Block a user