BeautifulSoup - Web crawler searches duplicated links

I'm trying to make a web crawler that recursively finds the external hyperlinks of a web page.

With the following code the crawler works, but it keeps searching for and inserting links that are already saved in the database.

I added a SELECT query to count the rows that have the same link, but nothing changed.

What is the problem?

Code:



def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile("^(https|http|www|//)((?!" + exclude_url + ").)*$")):
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]

        # Get matching rows
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))

        if select_in_return == 0:
            if link.attrs["href"].startswith("//"):
                cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, scheme + "://" + link.attrs["href"][2:], title, "Temp contents",))
                conn.commit()
            else:
                cur.execute("INSERT INTO internal_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                            (0, link.attrs["href"], title, "Temp contents",))
                conn.commit()


def split_address(addr):
    address_parts = None

    if "https" in addr:
        address_parts = addr.replace("https://", "").split("/")
        if "www" in address_parts[0]:
            address_parts = address_parts[0].replace("www.", "")
    elif "http" in addr:
        address_parts = addr.replace("http://", "").split("/")
        if "www" in address_parts[0]:
            address_parts = address_parts[0].replace("www.", "")

    return address_parts


def get_random_external_link(starting_page):
    html = urlopen(starting_page)

    try:
        bs_obj = BeautifulSoup(html, "html.parser")
    except AttributeError as e:
        return -1

    title = bs_obj.find("title")

    # Get scheme, netloc and title of URI and pass them to add_external_links()
    add_external_links(bs_obj, urlparse(starting_page).scheme, split_address(starting_page)[0], title.get_text())

    cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
    fetch = cur.fetchall()
    selected_tuple = str(fetch[0][0])

    if selected_tuple.startswith("b'"):
        selected_tuple = selected_tuple[2:]

    if selected_tuple.endswith("'"):
        selected_tuple = selected_tuple[:-1]

    return selected_tuple


def find_random_link(url):
    get_link = get_random_external_link(url)

    if get_link == -1:
        return -1
    else:
        return find_random_link(get_link)


DB "external_links":



+----------+--------------+------+-----+---------+----------------+
| Field | Type | Null | Key | Default | Extra |
+----------+--------------+------+-----+---------+----------------+
| idx | int(11) | NO | PRI | <null> | auto_increment |
| href | blob | NO | | <null> | |
| title | varchar(255) | NO | | <null> | |
| contents | blob | NO | | <null> | |
+----------+--------------+------+-----+---------+----------------+
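
(Aside: since href is a BLOB column, the driver returns it as bytes, which is why get_random_external_link() above strips a "b'" prefix from str(). A minimal sketch of decoding it instead, assuming PyMySQL and UTF-8 data:)

cur.execute("SELECT href FROM external_links ORDER BY RAND() LIMIT 1;")
fetch = cur.fetchall()

# href is a BLOB, so the driver hands back bytes; decode it rather than
# slicing the "b'...'" out of its str() representation (assumes UTF-8).
href = fetch[0][0]
selected = href.decode("utf-8") if isinstance(href, (bytes, bytearray)) else str(href)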


python beautifulsoup

asked Jan 20 at 10:00 by NBlizz (edited Jan 20 at 15:06)

  • You should check if links exist with fetchone(): if select_in_return.fetchone(). It will return None if there are no matches.

    – avram
    Jan 20 at 10:35
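
For reference, a minimal sketch of that fetchone()-based check, assuming a PyMySQL-style cursor and the names from the question:

cur.execute("SELECT 1 FROM external_links WHERE href=%s LIMIT 1;",
            (link.attrs["href"],))
row = cur.fetchone()  # None when no matching row exists

if row is None:
    cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                (0, link.attrs["href"], title, "Temp contents"))
    conn.commit()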











  • Can you please print the query after executing it and share the result in a comment below?

    – dvijparekh
    Jan 20 at 10:36
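
A sketch of how that debug printing might look, assuming PyMySQL (whose cursors provide mogrify(); other drivers expose this differently):

query = "SELECT * FROM external_links WHERE href=%s;"
args = (link.attrs["href"],)

# mogrify() returns the statement as it would be sent to the server,
# with parameters substituted (PyMySQL-specific; an assumption here).
print(cur.mogrify(query, args))

select_in_return = cur.execute(query, args)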
















1 Answer

This is because of the different URI styles:

- https://www.google.com
- https://google.com
- http://www.google.com
- http://google.com
- //www.google.com
- //google.com
- www.google.com

These seven links all point to the same address, but that isn't checked before entering the if select_in_return == 0: block. The INSERT INTO query reassembles them into the same address afterwards, but at the time of the SELECT query they are still different strings and are treated as different links, so duplicates get stored.



Solution:



def add_external_links(bs_obj, scheme, exclude_url, title):
    for link in bs_obj.find_all("a", href=re.compile("^(https|http|www|//)((?!" + exclude_url + ").)*$")):
        # Remove protocol (https:// or http:// or //) and host (www.) from URI
        if link.attrs["href"].startswith("//"):
            link.attrs["href"] = link.attrs["href"][2:]
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "https" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("https://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "http" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("http://", "")
            if "www" in link.attrs["href"]:
                link.attrs["href"] = link.attrs["href"].replace("www.", "")
        elif "www" in link.attrs["href"]:
            link.attrs["href"] = link.attrs["href"].replace("www.", "")

        # Remove trailing slash
        if link.attrs["href"].endswith("/"):
            link.attrs["href"] = link.attrs["href"][:-1]

        # Reassemble URI
        link.attrs["href"] = scheme + "://" + link.attrs["href"]

        # Get rows matching the URI
        select_in_return = cur.execute("SELECT * FROM external_links WHERE href=%s;", (link.attrs["href"],))

        # Add URI to database if it is not a duplicate
        if select_in_return == 0:
            cur.execute("INSERT INTO external_links (idx, href, title, contents) VALUES (%s, %s, %s, %s);",
                        (0, link.attrs["href"], title, "Temp contents",))
            conn.commit()
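
As a follow-up (not part of the original answer), the same normalization can be sketched with urllib.parse, which separates scheme, host, and path more reliably than string replacement. The handling of www. below assumes it only ever appears as a host prefix:

from urllib.parse import urlparse

def normalize_href(href, default_scheme="http"):
    # Give scheme-relative ("//host/...") and bare ("www.host/...") links
    # a scheme so urlparse() fills in netloc correctly.
    if href.startswith("//"):
        href = default_scheme + ":" + href
    elif not href.startswith(("http://", "https://")):
        href = default_scheme + "://" + href

    parts = urlparse(href)
    host = parts.netloc.lower()
    if host.startswith("www."):
        host = host[len("www."):]

    return default_scheme + "://" + host + parts.path.rstrip("/")

With this, all seven variants listed above reduce to http://google.com. A UNIQUE index on external_links.href (e.g. href(255), since the column is a BLOB) would additionally let MySQL reject duplicates even if two crawler passes race between the SELECT and the INSERT.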





answered Jan 20 at 15:06 by NBlizz (edited Jan 20 at 15:14)