Crawling Craiglisht with python (Not Scrapy)












1















I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()









share|improve this question

























  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28
















1















I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()









share|improve this question

























  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28














1












1








1








I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()









share|improve this question
















I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy



This is the URL: https://chicago.craigslist.org/



At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.



import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text

def get_category_list(content):
return category_pat.findall(content)[90:121]

def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result

def get_job_list(content):
result = job_list_pat.findall(content)
return result

def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description


def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)

job_dict = {'jobname': job_name, 'category': category_name}

job_dict['JOBURL'] = job_url

print('scraping', job_name)

content = get_category_list(job_url)

description = get_job_details(content)
job_dict['Description'] = description

print(job_dict)


def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)

for job_info in job_list:
scrape_job_info(job_info, category_name)

next_page = get_next_page(content)

if next_page is None:
break

category_url = next_page


def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)

for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)


if __name__ == '__main__':
url = 'https://chicago.craigslist.org'

response = requests.get(url)

content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')

next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next &gt; </a>s+<span class="button next" title="next page">s+next &gt;s+</span>s+</span>s+</div>s+</div>s+.+s+.+')

job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

crawl_website()






python-3.x web-scraping web-crawler craigslist






share|improve this question















share|improve this question













share|improve this question




share|improve this question








edited Jan 18 at 18:49









vezunchik

51139




51139










asked Jan 18 at 17:44









pythonerdudepythonerdude

72




72













  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28



















  • Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

    – Life is complex
    Jan 18 at 18:16











  • No, i AM USING CORE PYTHON, NO LIBRARY...

    – pythonerdude
    Jan 18 at 18:28











  • ANY I HAVE FIXED THE ISSUE FINALLY

    – pythonerdude
    Jan 18 at 18:28

















Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

– Life is complex
Jan 18 at 18:16





Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc

– Life is complex
Jan 18 at 18:16













No, i AM USING CORE PYTHON, NO LIBRARY...

– pythonerdude
Jan 18 at 18:28





No, i AM USING CORE PYTHON, NO LIBRARY...

– pythonerdude
Jan 18 at 18:28













ANY I HAVE FIXED THE ISSUE FINALLY

– pythonerdude
Jan 18 at 18:28





ANY I HAVE FIXED THE ISSUE FINALLY

– pythonerdude
Jan 18 at 18:28












1 Answer
1






active

oldest

votes


















0














You had a small string error in your code.



def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])

return description





share|improve this answer























    Your Answer






    StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
    StackExchange.using("snippets", function () {
    StackExchange.snippets.init();
    });
    });
    }, "code-snippets");

    StackExchange.ready(function() {
    var channelOptions = {
    tags: "".split(" "),
    id: "1"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
    StackExchange.using("snippets", function() {
    createEditor();
    });
    }
    else {
    createEditor();
    }
    });

    function createEditor() {
    StackExchange.prepareEditor({
    heartbeatType: 'answer',
    autoActivateHeartbeat: false,
    convertImagesToLinks: true,
    noModals: true,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: 10,
    bindNavPrevention: true,
    postfix: "",
    imageUploader: {
    brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
    contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
    allowUrls: true
    },
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    });


    }
    });














    draft saved

    draft discarded


















    StackExchange.ready(
    function () {
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54259026%2fcrawling-craiglisht-with-python-not-scrapy%23new-answer', 'question_page');
    }
    );

    Post as a guest















    Required, but never shown

























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes









    0














    You had a small string error in your code.



    def get_job_details(content):
    # error was caused here
    result = desc_pat.findall(str(content))
    if len(result) == 0:
    description = ''
    else:
    description = str(result[0])

    return description





    share|improve this answer




























      0














      You had a small string error in your code.



      def get_job_details(content):
      # error was caused here
      result = desc_pat.findall(str(content))
      if len(result) == 0:
      description = ''
      else:
      description = str(result[0])

      return description





      share|improve this answer


























        0












        0








        0







        You had a small string error in your code.



        def get_job_details(content):
        # error was caused here
        result = desc_pat.findall(str(content))
        if len(result) == 0:
        description = ''
        else:
        description = str(result[0])

        return description





        share|improve this answer













        You had a small string error in your code.



        def get_job_details(content):
        # error was caused here
        result = desc_pat.findall(str(content))
        if len(result) == 0:
        description = ''
        else:
        description = str(result[0])

        return description






        share|improve this answer












        share|improve this answer



        share|improve this answer










        answered Jan 18 at 18:31









        Life is complexLife is complex

        249212




        249212






























            draft saved

            draft discarded




















































            Thanks for contributing an answer to Stack Overflow!


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            To learn more, see our tips on writing great answers.




            draft saved


            draft discarded














            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54259026%2fcrawling-craiglisht-with-python-not-scrapy%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown





















































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown

































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown







            Popular posts from this blog

            How fix org.hibernate.TransientPropertyValueException

            Updating UILabel text programmatically using a function

            Cloud Functions - OpenCV Videocapture Read method fails for larger files from cloud storage