BeautifulSoup - Web crawler searches duplicated links

I'm trying to make a web crawler that recursively finds the external hyperlinks of a web page.

With the following code the crawler works, but it keeps searching for and inserting links that are already saved in the database.

I added a SELECT query to count the rows that have the same link, but nothing changed.

What is the problem?

Code:



def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile("^(https|http|www|//)((?!" + exclude_url + ").)*$")):
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]

        # Get matching rows
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))

        if select_in_return == 0:
            if link.attrs["href"].startswith("//"):
                cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, scheme + "://" + link.attrs["href"][2:], title, "Temp contents",))
                conn.commit()
            else:
                cur.execute("INSERT INTO internal_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, link.attrs["href"], title, "Temp contents",))
                conn.commit()


def split_address(addr):
    address_parts = None

    if "https" in addr:
        address_parts = addr.replace("https://", "").split("/")
        if "www" in address_parts[0]:
            address_parts = address_parts[0].replace("www.", "")
    elif "http" in addr:
        address_parts = addr.replace("http://", "").split("/")
        if "www" in address_parts[0]:
            address_parts = address_parts[0].replace("www.", "")

    return address_parts


def get_random_external_link(starting_page):
    html = urlopen(starting_page)

    try:
        bs_obj = BeautifulSoup(html, "html.parser")
    except AttributeError as e:
        return -1

    title = bs_obj.find("title")

    # Get scheme, netloc and title of URI and pass them to add_external_links()
    add_external_links(bs_obj, urlparse(starting_page).scheme, split_address(starting_page)[0], title.get_text())

    cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
    fetch = cur.fetchall()
    selected_tuple = str(fetch[0][0])

    if selected_tuple.startswith("b'"):
        selected_tuple = selected_tuple[2:]

    if selected_tuple.endswith("'"):
        selected_tuple = selected_tuple[:-1]

    return selected_tuple


def find_random_link(url):
    get_link = get_random_external_link(url)

    if get_link == -1:
        return -1
    else:
        return find_random_link(get_link)


DB "external_links":



+----------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+----------+--------------+------+-----+---------+----------------+
| idx | int(11) | NO | PRI | <null> | auto_increment |
| href | blob | NO | | <null> | |
| title | varchar(255) | NO | | <null> | |
| contents | blob | NO | | <null> | |
+----------+--------------+------+-----+---------+----------------+
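
(Aside: since href is a BLOB column, the driver returns it as bytes, which is why get_random_external_link() above strips a "b'" prefix from str(). A minimal sketch of decoding it instead, assuming PyMySQL and UTF-8 data:)

cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
fetch = cur.fetchall()

# href is a BLOB, so the driver hands back bytes; decode it rather than
# slicing the "b'...'" out of its str() representation (assumes UTF-8).
href = fetch[0][0]
selected = href.decode("utf-8") if isinstance(href, (bytes, bytearray)) else str(href)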


python beautifulsoup

asked Jan 20 at 10:00 by NBlizz (edited Jan 20 at 15:06)

  • You should check if links exist with fetchone(): if select_in_return.fetchone(). It will return None if there are no matches.

    – avram
    Jan 20 at 10:35
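
For reference, a minimal sketch of that fetchone()-based check, assuming a PyMySQL-style cursor and the names from the question:

cur.execute("SELECT 1 FROM external_links WHERE href=%s LIMIT 1;",
            (link.attrs["href"],))
row = cur.fetchone()  # None when no matching row exists

if row is None:
    cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                (0, link.attrs["href"], title, "Temp contents"))
    conn.commit()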











  • Can you please print the query after executing it and share the result in a comment below?

    – dvijparekh
    Jan 20 at 10:36
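
A sketch of how that debug printing might look, assuming PyMySQL (whose cursors provide mogrify(); other drivers expose this differently):

query = "SELECT * FROM external_links WHERE href=%s;"
args = (link.attrs["href"],)

# mogrify() returns the statement as it would be sent to the server,
# with parameters substituted (PyMySQL-specific; an assumption here).
print(cur.mogrify(query, args))

select_in_return = cur.execute(query, args)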
















1 Answer

This is because of the different URI styles:

- https://www.google.com
- https://google.com
- http://www.google.com
- http://google.com
- //www.google.com
- //google.com
- www.google.com

These seven links all point to the same address, but that isn't checked before entering the if select_in_return == 0: block. The INSERT INTO query reassembles them into the same address afterwards, but at the time of the SELECT query they are still different strings and are treated as different links, so duplicates get stored.



Solution:



def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile("^(https|http|www|//)((?!" + exclude_url + ").)*$")):
        # Remove protocol (https:// or http:// or //) and host (www.) from URI
        if link.attrs["href"].startswith("//"):
            link.attrs["href"] = link.attrs["href"][2:]
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "https" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("https://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "http" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("http://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "www" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("www.", "")

        # Remove trailing slash
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]

        # Reassemble URI
        link.attrs["href"] = scheme + "://" + link.attrs["href"]

        # Get rows matching the URI
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))

        # Add URI to database if it is not a duplicate
        if select_in_return == 0:
            cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                        (0, link.attrs["href"], title, "Temp contents",))
            conn.commit()
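
As a follow-up (not part of the original answer), the same normalization can be sketched with urllib.parse, which separates scheme, host, and path more reliably than string replacement. The handling of www. below assumes it only ever appears as a host prefix:

from urllib.parse import urlparse

def normalize_href(href, default_scheme="http"):
    # Give scheme-relative ("//host/...") and bare ("www.host/...") links
    # a scheme so urlparse() fills in netloc correctly.
    if href.startswith("//"):
        href = default_scheme + ":" + href
    elif not href.startswith(("http://", "https://")):
        href = default_scheme + "://" + href

    parts = urlparse(href)
    host = parts.netloc.lower()
    if host.startswith("www."):
        host = host[len("www."):]

    return default_scheme + "://" + host + parts.path.rstrip("/")

With this, all seven variants listed above reduce to http://google.com. A UNIQUE index on external_links.href (e.g. href(255), since the column is a BLOB) would additionally let MySQL reject duplicates even if two crawler passes race between the SELECT and the INSERT.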





answered Jan 20 at 15:06 by NBlizz (edited Jan 20 at 15:14)