Compare commits

...

4 Commits

Author SHA1 Message Date
Александр Геннадьевич Сальный
40ed31f5be remove venv 2022-10-15 21:56:31 +03:00
Александр Геннадьевич Сальный
37771ebd31 add some code 2022-10-15 21:54:50 +03:00
Александр Геннадьевич Сальный
2e04cc9061 Merge branch 'main' of https://git.danamir.ru/danamir/PinterestParse 2022-10-15 21:53:33 +03:00
Александр Геннадьевич Сальный
7caeeaaff5 second commit 2022-10-15 21:01:12 +03:00
3 changed files with 91 additions and 0 deletions

5
.gitignore vendored
View File

@@ -160,3 +160,8 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
driver
html
venv
*.log
imgs

64
main.py Normal file
View File

@@ -0,0 +1,64 @@
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver import Firefox
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import time
from datetime import date, datetime
import os
import codecs
# Scrape a Pinterest board: open it in Firefox, scroll until the page stops
# growing, then save the rendered HTML to a timestamped file under basePath.

now = datetime.now()
formatingDate = now.strftime("%d-%m-%Y-%H-%M")

# Raw string: the original literal relied on "\P" and "\h" being unrecognised
# escape sequences. The resulting path is unchanged.
basePath = r"D:\Programming\PythonProgects\PinterestParse\html"

# NOTE(review): `ua` is created but never applied to the browser below.
ua = UserAgent(use_cache_server=False)

# NOTE(review): this is a raw string with doubled backslashes, so the value
# contains doubled backslashes. Presumably the 'profile' preference is meant
# to select an existing Firefox profile -- confirm against the Selenium
# Firefox options documentation that this preference has that effect.
firefoxProfie = r'C:\\Users\\danamir.ASUMADI\\AppData\\Roaming\\Mozilla\\Firefox\\Profiles\\9vfgbpct.default'
options = Options()
options.set_preference('profile', firefoxProfie)

# geckodriver is expected in ./driver relative to the current working directory.
driverPath = os.path.join(os.getcwd(), "driver", "geckodriver.exe")
service = Service(driverPath)

url = "https://ru.pinterest.com/alexdanamir/%D1%87%D0%B5%D1%80%D1%82%D0%B5%D0%B6%D0%B8/"
SCROLL_PAUSE_TIME = 5

browser = Firefox(service=service, options=options)
try:
    browser.get(url)

    # Keep scrolling to the bottom until the document height stops changing,
    # i.e. no further content was loaded during the pause.
    last_height = browser.execute_script("return document.body.scrollHeight")
    while True:
        browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Give the page time to load the next batch of content.
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = browser.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height

    html = browser.page_source
    time.sleep(2)

    htmlName = "pinterest" + "-" + formatingDate + ".txt"
    saveFile = os.path.join(basePath, htmlName)

    # The output directory is git-ignored, so it may not exist on a fresh
    # checkout; create it rather than failing after the whole scrape.
    os.makedirs(basePath, exist_ok=True)

    # Write explicitly as UTF-8: the page source is arbitrary Unicode and the
    # platform default encoding may be unable to represent it.
    with open(saveFile, "w", encoding="utf-8") as savedFile:
        savedFile.write(html)
finally:
    # Always end the WebDriver session, even if scraping or saving failed.
    browser.quit()

22
parsehtml.py Normal file
View File

@@ -0,0 +1,22 @@
import re
import os
import logging
def main(base_path=None):
    """Log the path and the decoded contents of the first saved HTML dump.

    Args:
        base_path: Directory holding the saved page dumps. Defaults to the
            project's hard-coded ``html`` directory when omitted.

    The first entry of the sorted directory listing is read as bytes and
    decoded as UTF-8, falling back to cp1251 (undecodable bytes replaced)
    when the data is not valid UTF-8. If the directory is empty, an error is
    logged and the function returns without raising.
    """
    logging.basicConfig(filename='app.log', level=logging.DEBUG)
    if base_path is None:
        # Raw string: same value as the original literal, without relying on
        # unrecognised escape sequences such as "\P".
        base_path = r"D:\Programming\PythonProgects\PinterestParse\html"

    # os.listdir() order is arbitrary; sort so the chosen file is deterministic.
    entries = sorted(os.listdir(base_path))
    if not entries:
        logging.error("no files to parse in %s", base_path)
        return

    fileForParse = os.path.join(base_path, entries[0])
    logging.debug(fileForParse)

    # Read bytes and decode explicitly: text-mode read() returns str, which
    # has no .decode() on Python 3. The context manager closes the file.
    with open(fileForParse, 'rb') as openedFile:
        raw = openedFile.read()
    try:
        text = raw.decode('utf-8')
    except UnicodeDecodeError:
        text = raw.decode('cp1251', errors='replace')
    logging.debug(text)
# Script entry point: run main() only when executed directly, not on import.
if __name__ == "__main__":
    main()