Crawling Craigslist with Python (Not Scrapy)












1















I am trying to crawl Craigslist jobs using Python (I am not using Scrapy). Can anyone please fix the code below? Please don't suggest Scrapy.



This is the URL: https://chicago.craigslist.org/



First I extract the job categories, then the job listings, then the job details; I have also written code to crawl the next page.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
    """Download *url* and return the response body as decoded text.

    The scraped original had its indentation stripped; restored here.
    No error handling: a connection failure propagates as a requests
    exception, and a non-200 status still returns whatever body was sent.
    """
    response = requests.get(url)
    return response.text

def get_category_list(content):
    """Extract (href, name) category tuples from the craigslist front page.

    Uses the module-level ``category_pat`` regex (defined in ``__main__``).
    The hard-coded slice [90:121] keeps only the job-category links out of
    all ``<li><a ...>`` matches on the page — NOTE(review): this slice is
    position-dependent and will silently break if craigslist reorders its
    sidebar; verify against the live markup.
    """
    return category_pat.findall(content)[90:121]

def get_next_page(content):
    """Return the absolute URL of the next results page, or None if absent.

    ``next_page_pat`` (module global, defined in ``__main__``) captures the
    relative href of the "next" button; we prefix the site root to make it
    absolute. An empty match list means we are on the last page.
    """
    result = next_page_pat.findall(content)
    if not result:
        return None
    return 'https://chicago.craigslist.org/' + result[0]

def get_job_list(content):
    """Return a list of (job_url, job_name) tuples from a listing page.

    Thin wrapper around the module-level ``job_list_pat`` regex (defined
    in ``__main__``).
    """
    return job_list_pat.findall(content)

def get_job_details(content):
    """Extract the posting-body description from a job detail page.

    Returns the first match of the module-level ``desc_pat`` regex, or an
    empty string when the page has no recognizable posting body (e.g. a
    removed/flagged post).
    """
    result = desc_pat.findall(content)
    if not result:
        return ''
    return str(result[0])


def scrape_job_info(job_info, category_name):
    """Fetch one job posting and print a dict of its details.

    Parameters:
        job_info: (job_url, job_name) tuple as produced by get_job_list().
        category_name: human-readable category the job was listed under.

    BUG FIX: the original called ``get_category_list(job_url)`` here, which
    runs the category regex over the URL string and returns a (sliced) list
    of tuples — ``get_job_details`` then crashes calling ``findall`` on a
    list. The job page must be downloaded with ``get_page_content`` instead.
    """
    job_url, job_name = job_info
    job_name = unescape(job_name)  # listing titles contain HTML entities

    job_dict = {'jobname': job_name, 'category': category_name}
    job_dict['JOBURL'] = job_url

    print('scraping', job_name)

    # Was: content = get_category_list(job_url)  <-- wrong function, see above.
    content = get_page_content(job_url)

    job_dict['Description'] = get_job_details(content)

    print(job_dict)


def crawl_category(category_name, category_url):
    """Crawl every results page of one job category.

    Scrapes each job on the current page, then follows the "next" link
    until get_next_page() returns None (last page reached).
    """
    while True:
        print(category_url)
        content = get_page_content(category_url)
        job_list = get_job_list(content)
        print(job_list)

        for job_info in job_list:
            scrape_job_info(job_info, category_name)

        next_page = get_next_page(content)
        if next_page is None:
            break  # no "next" button: this was the last page

        category_url = next_page


def crawl_website():
    """Entry point: crawl every job category on chicago.craigslist.org.

    Downloads the front page, extracts the category links (which are
    site-relative hrefs), and crawls each category in turn.
    """
    url = 'https://chicago.craigslist.org'
    content = get_page_content(url)
    category_list = get_category_list(content)

    for category_url, category_name in category_list:
        # hrefs are relative ("/d/..."), so prefix the site root.
        crawl_category(category_name, url + category_url)


if __name__ == '__main__':
    # Compiled patterns live at module scope because the helper functions
    # reference them as globals.
    #
    # NOTE(review): the scraped original lost every backslash in these
    # regexes ("\w" became "w", "\s" became "s", "\d" became "d"), making
    # them match almost nothing. They are reconstructed below from the
    # surrounding literal HTML — verify against the live craigslist markup.
    category_pat = re.compile(
        r'<li><a href="(/d/[\w-]+/\w+/\w+)".+txt">([\w\-+\s/<]+)<sup class')

    next_page_pat = re.compile(
        r'<a href="/(.*)" class="button next" title="next\s+page">next &gt; </a>\s+'
        r'<span class="button next" title="next page">\s+next &gt;\s+</span>\s+'
        r'</span>\s+</div>\s+</div>\s+.+\s+.+')

    job_list_pat = re.compile(
        r'<a href="(https://\w+\.craigslist\.org/chc/.+html)".+hdrlnk">([\w\s*]+)</a>')
    desc_pat = re.compile(
        r'</div>\s*<section id="postingbody">.+html"></div>\s*</div>(.+)</section><ul')
    img_pat = re.compile(r'<img src="(.*\.jpg)" title')

    # The original also fetched the front page here into unused variables
    # (response/content) — a wasted HTTP request, removed; crawl_website()
    # downloads the front page itself.
    crawl_website()









share|improve this question

























  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28
















1















I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()









share|improve this question

























  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28














1












1








1








I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()









share|improve this question
















I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()






python-3.x web-scraping web-crawler craigslist






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited Jan 18 at 18:49









vezunchik

51139




51139










asked Jan 18 at 17:44









pythonerdudepythonerdude

72




72













  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28



















  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28

















Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

– Life is complex
Jan 18 at 18:16





Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

– Life is complex
Jan 18 at 18:16













No, i AM USING CORE PYTHON, NO LIBRARY...

– pythonerdude
Jan 18 at 18:28





No, i AM USING CORE PYTHON, NO LIBRARY...

– pythonerdude
Jan 18 at 18:28













ANY I HAVE FIXED THE ISSUE FINALLY

– pythonerdude
Jan 18 at 18:28





ANY I HAVE FIXED THE ISSUE FINALLY

– pythonerdude
Jan 18 at 18:28












1 Answer
1






active

oldest

votes


















0














You had a small string error in your code.



def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description





share|improve this answer























    Your Answer






    StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
    StackExchange.using("snippets", function () {
    StackExchange.snippets.init();
    });
    });
    }, "code-snippets");

    StackExchange.ready(function() {
    var channelOptions = {
    tags: "".split(" "),
    id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
    StackExchange.using("snippets", function() {
    createEditor();
    });
    }
    else {
    createEditor();
    }
    });

    function createEditor() {
    StackExchange.prepareEditor({
    heartbeatType: 'answer',
    autoActivateHeartbeat: false,
    convertImagesToLinks: true,
    noModals: true,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: 10,
    bindNavPrevention: true,
    postfix: "",
    imageUploader: {
    brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
    contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
    allowUrls: true
    },
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    });


    }
    });














    draft saved

    draft discarded


















    StackExchange.ready(
    function () {
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54259026%2fcrawling-craiglisht-with-python-not-scrapy%23new-answer', 'question_page');
    }
    );

    Post as a guest















    Required, but never shown

























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes









    0














    You had a small string error in your code.



    def get_job_details(content):
    # error was caused here
    result = desc_pat.findall(str(content))
    if len(result) == 0:
    description = ''
    else:
    description = str(result[0])

    return description





    share|improve this answer




























      0














      You had a small string error in your code.



      def get_job_details(content):
      # error was caused here
      result = desc_pat.findall(str(content))
      if len(result) == 0:
      description = ''
      else:
      description = str(result[0])

      return description





      share|improve this answer


























        0












        0








        0







        You had a small string error in your code.



        def get_job_details(content):
        # error was caused here
        result = desc_pat.findall(str(content))
        if len(result) == 0:
        description = ''
        else:
        description = str(result[0])

        return description





        share|improve this answer













        You had a small string error in your code.



        def get_job_details(content):
        # error was caused here
        result = desc_pat.findall(str(content))
        if len(result) == 0:
        description = ''
        else:
        description = str(result[0])

        return description






        share|improve this answer












        share|improve this answer



        share|improve this answer










        answered Jan 18 at 18:31









        Life is complexLife is complex

        249212




        249212






























            draft saved

            draft discarded




















































            Thanks for contributing an answer to Stack Overflow!


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            To learn more, see our tips on writing great answers.




            draft saved


            draft discarded














            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54259026%2fcrawling-craiglisht-with-python-not-scrapy%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown





















































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown

































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown







            Popular posts from this blog

            Liquibase includeAll doesn't find base path

            How to use setInterval in EJS file?

            Petrus Granier-Deferre