<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
</head>
<body text="#000000" bgcolor="#FFFFFF">
<p>Barry,</p>
<p>Thanks for the reply and assistance in solving this issue.<br>
</p>
<p>As you suggested, I tried to remove all dependencies for Anaconda
and Spyder, and started to hit some more brick walls with
versionitis of PyQt5.</p>
<p>So here we go.</p>
<ul>
<li>I created a virtualenv right on my Ubuntu system</li>
<li>When I installed various packages using pip, I could not
install PyQt 5.6 like I had in Anaconda; I had to choose at
minimum 5.7, so I dove all in with 5.9. And then the fun really
started.</li>
<li>The PyQt 5.6 to 5.9 conversion led to broken imports. I managed
to fix those and handle the differences between 5.6 and
5.9 using try/except pairs (see new BROKEN script below) <br>
</li>
<li>QWebPage changed locations and namespace (some docs here:
<a class="moz-txt-link-freetext" href="http://doc.qt.io/qt-5/qtwebenginewidgets-qtwebkitportingguide.html">http://doc.qt.io/qt-5/qtwebenginewidgets-qtwebkitportingguide.html</a>)<br>
</li>
<li>Then of course the PyQT C library changed so I had to find new
import modules and new sub-modules.</li>
<li>Then PyQt 5.9 introduces some asynchronous functions that
expect callback functions. I really don't understand callback
functions, but I tried creating one anyhow.</li>
<li>The new "pip list" for the script below includes the following</li>
<ul>
<li>"""beautifulsoup4==4.6.0<br>
bs4==0.0.1<br>
certifi==2017.11.5<br>
chardet==3.0.4<br>
idna==2.6<br>
pip==9.0.1<br>
pkg-resources==0.0.0<br>
PyQt5==5.9.2<br>
QtPy==1.3.1<br>
requests==2.18.4<br>
setuptools==38.2.5<br>
sip==4.19.6<br>
urllib3==1.22<br>
wheel==0.30.0"""</li>
</ul>
</ul>
<p>revised script with new issues:</p>
<p>#!/usr/bin/env python3<br>
# -*- coding: utf-8 -*-<br>
#<br>
# Copyright (c) Tue Dec 19 21:00:56 2017, David Sampson
(<a class="moz-txt-link-abbreviated" href="mailto:samper.d@gmail.com">samper.d@gmail.com</a>)<br>
#<br>
# license: GNU LGPL<br>
#<br>
# This library is free software; you can redistribute it and/or<br>
# modify it under the terms of the GNU Lesser General Public<br>
# License as published by the Free Software Foundation; either<br>
# version 2.1 of the License, or (at your option) any later
version.<br>
<br>
<br>
"""<br>
Created on Tue Dec 19 21:00:56 2017<br>
<br>
@author: sampson<br>
"""<br>
<br>
# Imports<br>
import sys<br>
import requests<br>
#from config import *<br>
from bs4 import BeautifulSoup<br>
from PyQt5.QtCore import QThread, QBasicTimer, QCoreApplication<br>
try:<br>
from PyQt5.QtWebKitWidgets import QWebPage<br>
pyqt="5.6"<br>
except:<br>
from PyQt5.QtWebEngineWidgets import QWebEnginePage as
QWebPage<br>
pyqt="5.9"<br>
<br>
from PyQt5.QtWidgets import QApplication<br>
<br>
# Variables<br>
#url = '<a class="moz-txt-link-freetext" href="http://webscraping.com">http://webscraping.com</a>' <br>
#url='<a class="moz-txt-link-freetext" href="http://www.amazon.com">http://www.amazon.com</a>'<br>
#TickerList = "AEU.UN, BMO, BNS"<br>
Ticker = "BNS"<br>
TickerList = "AEU.UN, BMO"<br>
url=<a class="moz-txt-link-rfc2396E" href="http://www.JsEnabledSite.com">"http://www.JsEnabledSite.com"</a><br>
<br>
# Constants<br>
<br>
<br>
# Main function<br>
<br>
<br>
<br>
<br>
class Render(QWebPage):<br>
"""Render HTML with PyQt5 WebKit."""<br>
def __init__(self, html):<br>
self.html = None<br>
self.app = QCoreApplication.instance()<br>
if self.app is None:<br>
self.app = QApplication(sys.argv)<br>
print("Creating new QApplication instance")<br>
else:<br>
print("using Existing instance of QApplication: %s" %
str(self.app)) <br>
QWebPage.__init__(self)<br>
self.timer = QBasicTimer()<br>
self.timer.start(5000, self)<br>
if self.timer.isActive() == True:<br>
print("timer is active")<br>
print("timer ID: %s" % str(self.timer.timerId()))<br>
else:<br>
print("timer is inactive")<br>
self.loadStarted.connect(self._loadStarted)<br>
self.loadProgress.connect(self._loadProgress)<br>
self.loadFinished.connect(self._loadFinished)<br>
try:<br>
self.mainFrame().setHtml(html)<br>
except:<br>
self.setHtml(html)<br>
self.app.exec()<br>
<br>
<br>
def _loadFinished(self, result):<br>
if pyqt == "5.9":<br>
#self.html = self.toHtml(get_html())<br>
print("fetching HTML")<br>
self.toHtml(self.setHtml())<br>
print("HTML Fetched")<br>
<br>
if pyqt == "5.6":<br>
self.html = self.mainFrame().toHtml()<br>
print("Finihsed function passed")<br>
<br>
else:<br>
print("you are running pyqt version: unknown")<br>
<br>
print("Load Finished")<br>
self.timer.stop()<br>
if self.timer.isActive() == True:<br>
print("timer is active")<br>
print("timer ID: %s" % str(self.timer.timerId()))<br>
else:<br>
print("timer is inactive")<br>
QApplication.instance().quit()<br>
#self.app.quit()<br>
<br>
def _loadStarted(self):<br>
print("Page is loading content")<br>
<br>
def _loadProgress(self, progress):<br>
print("Page is loading: %s" % progress)<br>
<br>
def get_html(self, result):<br>
print(self.html)<br>
print(result)<br>
return result<br>
<br>
<br>
<br>
def main():<br>
print("url: %s" % url) <br>
get_page(url) <br>
<br>
<br>
def get_page(Url):<br>
"""<br>
This module accepts a URL and returns a page with rendered
JavaScript <br>
using PyQt5 Webkit.Stock<br>
"""<br>
#global app<br>
# get the raw HTML<br>
SourceHtml = requests.get(Url).text<br>
<br>
#app = QApplication(sys.argv)<br>
#app = QApplication.instance()<br>
#if app is None:<br>
# app = QApplication(sys.argv)<br>
#else:<br>
# print("using Existing instance of QApplication: %s" %
str(app))<br>
RenderedHtml = Render(SourceHtml).html<br>
#app.exec_()<br>
<br>
#sys.exit(app.exec())<br>
<br>
#print(RenderedHtml)<br>
print("Finished")<br>
print(RenderedHtml)<br>
return RenderedHtml<br>
<br>
<br>
if __name__ == '__main__':<br>
main()<br>
#get_page(url)<br>
</p>
<br>
<div class="moz-cite-prefix">On 01/01/2018 03:32 AM, Barry wrote:<br>
</div>
<blockquote type="cite"
cite="mid:2C9F0319-65C9-4627-A31A-7D1B733409B3@barrys-emacs.org">
<meta http-equiv="content-type" content="text/html; charset=utf-8">
<div>What happens if you run your script outside of spyder from
the bash prompt?</div>
<div>Maybe it's Spyder that is breaking things because it's in
control, not your code.</div>
<div><br>
</div>
<div>Barry</div>
<div><br>
On 1 Jan 2018, at 04:02, Dave Sampson <<a
href="mailto:samper.d@gmail.com" moz-do-not-send="true">samper.d@gmail.com</a>>
wrote:<br>
<br>
</div>
<blockquote type="cite">
<div>
<meta http-equiv="content-type" content="text/html;
charset=utf-8">
<p>Hey Folks,</p>
<p>A special thanks to anyone who takes time to read about my
current situation and provide part of or all of the
solution.<br>
</p>
<p>The Issue:</p>
<p>==========<br>
</p>
<p>As part of a larger project, I am wanting to develop a
function, class or module using python that will accept a
URL of a javascript enabled website and return the page's
HTML contents generated by those JS scripts. Please see the
end of this post for a partially functioning script.<br>
</p>
<p>I used this reference as my initial source that got me
using the PyQt bindings.
<a class="moz-txt-link-freetext"
href="https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/"
moz-do-not-send="true">https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/</a></p>
<p>Much of the documentation refers to PyQt4, I am using
PyQt5.6, which is sometimes not in line with more recent
PyQt5 documentation. So there has been lots of trial and
error along the way.<br>
</p>
<p>I am able to run the below script once, however upon a
second run of the script I keep running up against this
error: "QBasicTimer::start: Stopping previous timer failed.
Possibly trying to stop from a different thread".</p>
<p>I will need to scrape at minimum 3 separate URL's each time
this module/class is called. I assume that once I get the
solution to make two successful runs then running 10's or
100's should also work.</p>
<p>This module will be called from another control module. I
have little interest in persistent objects like
QApplication, between the different calls.</p>
<p>At this point I am getting a bit frustrated that I can't
just send a function a URL and receive back HTML in rapid
succession. If JS was not involved I would not have this
issue. Using requests, urllib and BeautifulSoup for walking
through the HTMLDOC are all working fine.</p>
<p>Please note I am not a C programmer so being able to
abstract concepts from C documentation to python is
sometimes a challenge. I would consider myself an
intermediate python scripter who learned on 2.7 and now
consistently use 3.6 for my projects. Pardon if some syntax
smells of 2.x. I try mostly to follow PEP8 once I have solid
code in place.</p>
<p>Also note that I am not a classically trained developer, I
am a geographer who designs Geographic Information Systems
(GIS), so the concept of classes is still a bit abstract. I
understand analogies of classes for describing cars, robots
and a pizza making process, but leveraging PyQT for web
scraping and creating GUI objects that will never render
anything visually is a bit confusing. Maybe someday urllib
will process JS. Generally I use python for automation of
data management tasks and processes in the geomatics domain.
So please assume I need some explanation if your response is
"remember when using classes do this and that", I likely
never knew what you will refer to. Assume I know nothing
about classes and the lineage of inheritance of PyQt
objects. (smile)<br>
</p>
<p>The environment:</p>
<p>============<br>
</p>
<p>* Ubuntu 16.04 LTS<br>
</p>
<p>* Anaconda navigator 1.6.11</p>
<p> * Python 3 dedicated environment<br>
</p>
<p>* Python 3.6.3 (64 bit)<br>
</p>
<p>* Spyder 3.2.5<br>
</p>
<p>* PyQt 5.6</p>
<p>* Qt 5.6.2<br>
</p>
<p><br>
</p>
<p>Sources:</p>
<p>=======<br>
</p>
<p>These are some of the sources I have used to try and solve
this problem. I did not capture all of the sources used as
many other resources point back to these sources:</p>
<p>*
<a class="moz-txt-link-freetext"
href="https://stackoverflow.com/questions/6180293/pyqt-timers-cannot-be-started-from-another-thread"
moz-do-not-send="true">https://stackoverflow.com/questions/6180293/pyqt-timers-cannot-be-started-from-another-thread</a></p>
<p>*
<a class="moz-txt-link-freetext"
href="https://forum.qt.io/topic/13459/timers-cannot-be-stopped-from-another-thread-but-how-do-i-stop-start-timer-in-thread"
moz-do-not-send="true">https://forum.qt.io/topic/13459/timers-cannot-be-stopped-from-another-thread-but-how-do-i-stop-start-timer-in-thread</a></p>
<p>* <a class="moz-txt-link-freetext"
href="https://github.com/spyder-ide/spyder/issues/974"
moz-do-not-send="true">https://github.com/spyder-ide/spyder/issues/974</a></p>
<p>6. [Web Scraping Primer using Webkit
(<a class="moz-txt-link-freetext"
href="https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/"
moz-do-not-send="true">https://impythonist.wordpress.com/2015/01/06/ultimate-guide-for-scraping-javascript-rendered-web-pages/</a>)]<br>
7. [How to run PyQt applications within Spyder
(<a class="moz-txt-link-freetext"
href="https://github.com/spyder-ide/spyder/wiki/How-to-run-PyQt-applications-within-Spyder"
moz-do-not-send="true">https://github.com/spyder-ide/spyder/wiki/How-to-run-PyQt-applications-within-Spyder</a>)]<br>
8. [PyQT code can not run twice when using Spyder IDE (<a
class="moz-txt-link-freetext"
href="https://github.com/spyder-ide/spyder/issues/2970"
moz-do-not-send="true">https://github.com/spyder-ide/spyder/issues/2970</a>)]</p>
<p><br>
</p>
<p>Problem solving approaches:</p>
<p>==================</p>
<ol>
<li>The first issue I came across is related to running
PyQt objects like QApplication within the Spyder IDE, since
there is already a QApplication object created by Spyder.
So this problem led to a few different approaches.</li>
<ol>
<li>Moved app = QApplication() outside of the __init__()
function of the class</li>
<li>Added a conditional check for QApplication.instance()
to either reuse the existing object or create one if
missing. This depends on if you run the code inside or
outside of a QT environment. For example running at the
command line should create a new instance, where running
in Spyder reuses the existing instance</li>
<li>Created the App object in another module that imports
this one. The same result is achieved since the
namespace of the imported module perpetuates the same
namespace.</li>
<li>I have even tried overriding various names in the
namespace without success. I understand this is
dangerous, but wanted to try and isolate what object is
creating the timer.<br>
</li>
<li>Resetting the whole namespace loses some valuable
settings found in "from config.py import *". For the
purpose of the script below that line can remain
commented out.</li>
<li>Moving the creation of the App object outside the
class caused some other issues when the page was loaded
and the app.quit() function is called. So I tried
grabbing the app object created outside the class, but I
could never grab onto it to close it, so it would hang.
I wondered if I could then call a function external to
the class<br>
</li>
</ol>
<li>So after trying to attack the persistent QApplication
Object I went after trying to solve the QBasicTimer issue.</li>
<ol>
<li>I tried killing the timer with app.killTimer() before
the app.quit() is called once the page is loaded.
However, no timer was found. This suggests that some
other object was creating a timer. I could not track
that down.</li>
<li>Although the self.app object created by QApplication
had an app.killTimer() function there were no start() or
stop() timer methods, and I could not find out what
value was left on the timer. So I tried creating a timer
object within the class for the app object. This allowed
me to start and stop timers (set to 5 seconds), however
even after verifying that the timer was created,
started, killed and then no longer existed I still get
the QBasicTimer error.</li>
<li>In some cases, if I killed the timer too early the
program would hang. Killing the timer after the
app.quit() function rendered no timer. So did I create a
timer that still ran after the app died? Or is the timer
somewhere else?<br>
</li>
</ol>
<li>The next area I started to explore was the concept of
creating threads with QThreads. The sources I read were
giving this concept high hopes. I soon was reminded I am
horrible at reading C documentation and quickly gave up.
Besides, even creating threads may bring those persistent
objects into my executed namespace.<br>
</li>
<li>External processes was my next approach. <br>
</li>
<ol>
<li>Could I use something like "from subprocess import
call" and then I could just call(["python",
"this-script.py", <a class="moz-txt-link-rfc2396E"
href="http://this.url.com" moz-do-not-send="true">"http://this.url.com"</a>])
as a subprocess. Thinking that once the subprocess would
run then the process would die along with any timers. I
could also create the QApplication object in a jail and
not pollute my namespace. I could not even get simple
linux commands like "ls" to return anything meaningful,
let alone a "return" value from the script. result =
fail.</li>
</ol>
<li>Why use Anaconda/Spyder at all? I came across these
types of responses in various posts to similar problems.
It is a fair enough question, so let me try to address
some points to consider.<br>
</li>
<ol>
<li>I work in government and think it is high time that
any public servant who has an idea to automate something
in their workflow should have easy tools to help.
Anaconda could be this tool.<br>
</li>
<li>If the final solution is to strip out everything
except for a text editor (vim) and command line (BASH)
then I will take it. At the end of the day I need
something that works. I tend to work in these types of
stripped down environments through SSH on servers
anyways. But perhaps there is an alternative solution.</li>
<li>I have been using Python and BASH for years and like
the lightweight approach, however it is not for the
faint of heart. I just recently started using Anaconda
as a general data wrangling platform and I like its
elegance. I have been promoting it with other non
developers and govies (aka public servants) playing in
the data space. I feel the platform is and could be a
great equalizer. I would feel pretty silly recommending
this platform if it could not support simple
web scraping. result = fail!</li>
<li>I also just learned about Spyder, through using
Anaconda. Spyder provides me what I need and emulates my
lightweight setup of a text editor, Command Line and
File browser setup. Also I like the debug tools, being
able to see all the variables in the namespace, create
stop points and control step throughs. All of these
things I never had at the command line. And IPython is
quite intriguing. I think I am just at the start of
another journey towards efficiency.</li>
<li>I am a big believer that tech tools should not
influence or impede workflows. In theory this should all
work in anaconda</li>
<li>I find setting up individual virtual environments
tedious for the various python projects I have on the
go.</li>
</ol>
</ol>
<p>That is a general overview of my battles with PyQt for web
scraping over the past couple of weeks. So now I turn to
the mailing list of creative, professional and motivated
PyQt users to let me know what obvious solution I am
missing.<br>
</p>
<p>So now I present to you fine and dedicated list viewers my
script for consideration. I look forward to learning what
solution(s) come forward.</p>
<p>I am prepared to be humbled.<br>
</p>
<p><br>
</p>
<p>The Script:</p>
<p>========</p>
<p>This module should be able to run once in either the Spyder
IPython Console or an Anaconda terminal window. When you run
it a second time you should get the error. I have left many
other code snippets intact using comments to let you know
some of the other approaches I have taken.</p>
<p>#!/usr/bin/env python3<br>
# -*- coding: utf-8 -*-<br>
#<br>
# Copyright (c) Tue Dec 19 21:00:56 2017, David Sampson (<a
class="moz-txt-link-abbreviated"
href="mailto:samper.d@gmail.com" moz-do-not-send="true">samper.d@gmail.com</a>)<br>
#<br>
# license: GNU LGPL<br>
#<br>
# This library is free software; you can redistribute it
and/or<br>
# modify it under the terms of the GNU Lesser General
Public<br>
# License as published by the Free Software Foundation;
either<br>
# version 2.1 of the License, or (at your option) any later
version.<br>
<br>
<br>
"""<br>
Created on Tue Dec 19 21:00:56 2017<br>
<br>
@author: sampson<br>
"""<br>
<br>
# Imports<br>
import sys<br>
import requests<br>
#from config import *<br>
from PyQt5.QtCore import QThread, QBasicTimer,
QCoreApplication<br>
from PyQt5.QtWebKitWidgets import QWebPage<br>
from PyQt5.QtWidgets import QApplication<br>
<br>
# Variables<br>
#url = '<a class="moz-txt-link-freetext"
href="http://webscraping.com" moz-do-not-send="true">http://webscraping.com</a>'
<br>
#url='<a class="moz-txt-link-freetext"
href="http://www.amazon.com" moz-do-not-send="true">http://www.amazon.com</a>'<br>
</p>
<p>### EDIT THIS###<br>
url=<a class="moz-txt-link-rfc2396E"
href="http://www.JsEnabledSite.com" moz-do-not-send="true">"http://www.JsEnabledSite.com"</a><br>
<br>
# Constants<br>
<br>
<br>
# Main function<br>
def main():<br>
get_page(url) <br>
<br>
<br>
<br>
class Render(QWebPage):<br>
"""Render HTML with PyQt5 WebKit."""<br>
def __init__(self, html):<br>
self.html = None<br>
self.app = QCoreApplication.instance()<br>
if self.app is None:<br>
self.app = QApplication(sys.argv)<br>
print("Creating new QApplication instance")<br>
else:<br>
print("using Existing instance of QApplication:
%s" % str(self.app)) <br>
QWebPage.__init__(self)<br>
self.timer = QBasicTimer()<br>
self.timer.start(5000, self)<br>
if self.timer.isActive() == True:<br>
print("timer is active")<br>
print("timer ID: %s" %
str(self.timer.timerId()))<br>
else:<br>
print("timer is inactive")<br>
self.loadFinished.connect(self._loadFinished)<br>
self.mainFrame().setHtml(html)<br>
self.app.exec()<br>
<br>
<br>
def _loadFinished(self, result):<br>
self.html = self.mainFrame().toHtml()<br>
print("Load Finished")<br>
self.timer.stop()<br>
if self.timer.isActive() == True:<br>
print("timer is active")<br>
print("timer ID: %s" %
str(self.timer.timerId()))<br>
else:<br>
print("timer is inactive")<br>
QApplication.instance().quit()<br>
#self.app.quit()<br>
<br>
<br>
<br>
<br>
def get_page(Url):<br>
"""<br>
This module accepts a URL and returns a page with
rendered JavaScript <br>
using PyQt5 Webkit.Stock<br>
"""<br>
#global app<br>
# get the raw HTML<br>
SourceHtml = requests.get(Url).text<br>
<br>
#app = QApplication(sys.argv)<br>
#app = QApplication.instance()<br>
#if app is None:<br>
# app = QApplication(sys.argv)<br>
#else:<br>
# print("using Existing instance of QApplication: %s"
% str(app))<br>
RenderedHtml = Render(SourceHtml).html<br>
#app.exec_()<br>
<br>
#sys.exit(app.exec())<br>
<br>
#print(RenderedHtml)<br>
print("Finished")<br>
return RenderedHtml<br>
<br>
<br>
if __name__ == '__main__':<br>
main()<br>
#get_page(url)<br>
</p>
<p><br>
</p>
</div>
</blockquote>
<blockquote type="cite">
<div><span>_______________________________________________</span><br>
<span>PyQt mailing list <a
href="mailto:PyQt@riverbankcomputing.com"
moz-do-not-send="true">PyQt@riverbankcomputing.com</a></span><br>
<span><a
href="https://www.riverbankcomputing.com/mailman/listinfo/pyqt"
moz-do-not-send="true">https://www.riverbankcomputing.com/mailman/listinfo/pyqt</a></span></div>
</blockquote>
</blockquote>
<br>
</body>
</html>