BeautifulSoup - Web crawler searches duplicated links
I'm trying to make a web crawler that recursively finds the external hyperlinks of a web page.
With the following code the crawler works, but it keeps inserting links that are already saved in the database.
I added a SELECT query to count the rows that contain the same link, but nothing changed.
What is the problem?
Code:
import re
from urllib.request import urlopen
from urllib.parse import urlparse

from bs4 import BeautifulSoup

# conn and cur are a MySQL connection and cursor (e.g. PyMySQL) created elsewhere.


def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile("^(https|http|www|//)((?!" + exclude_url + ").)*$")):
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]

        # Get matching rows (with PyMySQL/MySQLdb, execute() returns the number of rows found)
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))

        if select_in_return == 0:
            if link.attrs["href"].startswith("//"):
                cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, scheme + "://" + link.attrs["href"][2:], title, "Temp contents",))
                conn.commit()
            else:
                cur.execute("INSERT INTO internal_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, link.attrs["href"], title, "Temp contents",))
                conn.commit()


def split_address(addr):
    address_parts = None
    if "https" in addr:
        address_parts = addr.replace("https://", "").split("/")
        if "www" in address_parts[0]:
            address_parts = address_parts[0].replace("www.", "")
    elif "http" in addr:
        address_parts = addr.replace("http://", "").split("/")
        if "www" in address_parts[0]:
            address_parts = address_parts[0].replace("www.", "")
    return address_parts


def get_random_external_link(starting_page):
    html = urlopen(starting_page)
    try:
        bs_obj = BeautifulSoup(html, "html.parser")
    except AttributeError as e:
        return -1

    title = bs_obj.find("title")
    # Get scheme, netloc and title of URI and pass them to add_external_links()
    add_external_links(bs_obj, urlparse(starting_page).scheme, split_address(starting_page)[0], title.get_text())

    cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
    fetch = cur.fetchall()
    selected_tuple = str(fetch[0][0])

    # Strip the b'...' wrapper that comes from storing href as a BLOB
    if selected_tuple.startswith("b'"):
        selected_tuple = selected_tuple[2:]
    if selected_tuple.endswith("'"):
        selected_tuple = selected_tuple[:-1]

    return selected_tuple


def find_random_link(url):
    get_link = get_random_external_link(url)
    if get_link == -1:
        return -1
    else:
        return find_random_link(get_link)
DB "external_links":
+----------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+----------+--------------+------+-----+---------+----------------+
| idx | int(11) | NO | PRI | <null> | auto_increment |
| href | blob | NO | | <null> | |
| title | varchar(255) | NO | | <null> | |
| contents | blob | NO | | <null> | |
+----------+--------------+------+-----+---------+----------------+
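As an aside (not something the question asks for), with this schema the database itself can be made to reject duplicates, which would make the SELECT check unnecessary. This is only a minimal sketch, assuming MySQL with a PyMySQL-style cursor; because href is a BLOB, the unique index needs a key-length prefix, and the index name used here is made up:

# One-time schema change: unique index on the first 255 bytes of href (hypothetical index name)
cur.execute("ALTER TABLE external_links ADD UNIQUE INDEX uq_external_links_href (href(255));")

# INSERT IGNORE silently skips rows whose href already exists instead of raising a duplicate-key error.
# some_href and title stand in for the values built in the crawl loop.
cur.execute("INSERT IGNORE INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
            (0, some_href, title, "Temp contents"))
conn.commit()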
Tags: python, beautifulsoup
asked Jan 20 at 10:00 by NBlizz; edited Jan 20 at 15:06
You should check if links exist with fetchone(): if select_in_return.fetchone(). It will return None if there are no matches. – avram, Jan 20 at 10:35
Can you please print the query after executing it and post the query in a comment below? – dvijparekh, Jan 20 at 10:36
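For illustration, a minimal sketch of the existence check avram suggests, with fetchone() called on the cursor itself rather than on the value returned by execute(); it assumes the same PyMySQL-style cur/conn as the question, and the helper name link_exists is made up:

def link_exists(cur, href):
    # fetchone() returns None when the result set is empty
    cur.execute("SELECT 1 FROM external_links WHERE href=%s LIMIT 1;", (href,))
    return cur.fetchone() is not None

# Usage inside the crawl loop (sketch):
# if not link_exists(cur, link.attrs["href"]):
#     cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
#                 (0, link.attrs["href"], title, "Temp contents"))
#     conn.commit()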
1 Answer
This happens because of the different URI styles:
- https://www.google.com
- https://google.com
- http://www.google.com
- http://google.com
- //www.google.com
- //google.com
- www.google.com
These seven links all point to the same address, but they are not normalized before the if select_in_return == 0: check. They are only rewritten into a single form by the INSERT INTO query, so the earlier SELECT query still treats them as different links and the duplicates get stored.
Solution:
def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile("^(https|http|www|//)((?!" + exclude_url + ").)*$")):
        # Remove protocol (https://, http:// or //) and host prefix (www.) from the URI
        if link.attrs["href"].startswith("//"):
            link.attrs["href"] = link.attrs["href"][2:]
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "https" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("https://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "http" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("http://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "www" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("www.", "")

        # Remove trailing slash
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]

        # Reassemble URI with the crawler's scheme
        link.attrs["href"] = scheme + "://" + link.attrs["href"]

        # Get rows matching the normalized URI
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))

        # Add URI to database only if it is not already there
        if select_in_return == 0:
            cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                        (0, link.attrs["href"], title, "Temp contents",))
            conn.commit()
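For comparison, the same normalization can be done with urllib.parse instead of chained replace() calls. This is only a sketch (the function name normalize_href is made up, and the crawler's scheme is assumed to be passed in, as in the code above):

from urllib.parse import urlsplit

def normalize_href(href, scheme="http"):
    # Strip whatever scheme the link carries, then re-attach the crawler's scheme,
    # mirroring what the code above does with replace()
    if href.startswith("//"):
        href = href[2:]
    elif "://" in href:
        href = href.split("://", 1)[1]

    parts = urlsplit("//" + href, scheme=scheme)
    host = parts.netloc.lower()
    if host.startswith("www."):
        host = host[len("www."):]

    # Drop the trailing slash so "/path/" and "/path" compare equal
    return scheme + "://" + host + parts.path.rstrip("/")

# e.g. normalize_href("//www.google.com/"), normalize_href("https://google.com") and
# normalize_href("www.google.com") all return "http://google.com"

All seven example forms above collapse to the same string this way, so the SELECT/INSERT check sees them as one link.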
answered Jan 20 at 15:06 by NBlizz; edited Jan 20 at 15:14