Crawling Craigslist with Python (not Scrapy)
I am trying to crawl Craigslist jobs using Python (I am not using Scrapy). Can anyone please fix the code below? Please don't suggest Scrapy.
This is the URL: https://chicago.craigslist.org/
First I extract the job categories, then the job listings, then the job details; I have also written code to crawl to the next page.
import re
import requests
import csv
from html import unescape
def get_page_content(url):
    """Fetch *url* over HTTP and return the response body as text."""
    return requests.get(url).text
def get_category_list(content):
    """Return (href, name) category tuples scraped from homepage HTML.

    Only matches 90..120 are kept — presumably the slice of the sidebar
    that holds the jobs section; NOTE(review): confirm against live page.
    """
    matches = category_pat.findall(content)
    return matches[90:121]
def get_next_page(content):
    """Return the absolute URL of the next results page, or None if absent."""
    matches = next_page_pat.findall(content)
    if not matches:
        return None
    return 'https://chicago.craigslist.org/' + matches[0]
def get_job_list(content):
    """Return (job_url, job_title) tuples scraped from one listing page."""
    return job_list_pat.findall(content)
def get_job_details(content):
    """Return the posting-body description text, or '' when nothing matches."""
    matches = desc_pat.findall(content)
    if not matches:
        return ''
    return str(matches[0])
def scrape_job_info(job_info, category_name):
    """Fetch one job posting, print its details, and return them as a dict.

    Parameters:
        job_info: (job_url, job_name) tuple as produced by get_job_list().
        category_name: name of the category the job was listed under.

    Returns:
        dict with 'jobname', 'category', 'JOBURL' and 'Description' keys.
    """
    job_url, job_name = job_info
    job_name = unescape(job_name)  # titles come straight from HTML (&amp; etc.)
    job_dict = {'jobname': job_name, 'category': category_name,
                'JOBURL': job_url}
    print('scraping', job_name)
    # BUG FIX: the original called get_category_list(job_url), which just ran
    # the category regex over the URL string, so the description was always
    # empty.  The detail page must actually be downloaded first.
    content = get_page_content(job_url)
    job_dict['Description'] = get_job_details(content)
    print(job_dict)
    return job_dict
def crawl_category(category_name, category_url):
    """Walk every results page of one category, scraping each job found."""
    next_url = category_url
    while next_url is not None:
        print(next_url)
        content = get_page_content(next_url)
        jobs = get_job_list(content)
        print(jobs)
        for job_info in jobs:
            scrape_job_info(job_info, category_name)
        # pagination: None ends the loop once there is no "next" button
        next_url = get_next_page(content)
def crawl_website():
    """Scrape the Chicago Craigslist homepage and crawl every job category."""
    base = 'https://chicago.craigslist.org'
    homepage = get_page_content(base)
    for category_href, category_name in get_category_list(homepage):
        # category hrefs are site-relative, so prefix the base URL
        crawl_category(category_name, base + category_href)
# Patterns are compiled at module level (the original compiled them inside the
# __main__ guard, so every helper raised NameError when this file was
# imported instead of run as a script).
#
# NOTE(review): the regex backslashes were lost when this code was pasted
# (e.g. "w+" where "\w+" was meant), so none of the original patterns could
# ever match.  They are restored below — confirm against the live page HTML.
category_pat = re.compile(
    r'<li><a href="(/d/[\w-]+/\w+/\w+)".+txt">([\w\-+\s+/<]+)<sup class')
next_page_pat = re.compile(
    r'<a href="/(.*)" class="button next" title="next\s+page">next > </a>\s+'
    r'<span class="button next" title="next page">\s+next >\s+</span>\s+'
    r'</span>\s+</div>\s+</div>\s+.+\s+.+')
job_list_pat = re.compile(
    r'<a href="(https://\w+\.craigslist\.org/chc/.+html)".+hdrlnk">'
    r'([\w\s*]+)</a>')
desc_pat = re.compile(
    r'</div>\s*<section id="postingbody">.+html"></div>\s*</div>'
    r'(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')

if __name__ == '__main__':
    crawl_website()
python-3.x web-scraping web-crawler craigslist
add a comment |
I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy
This is the URL: https://chicago.craigslist.org/
At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.
import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text
def get_category_list(content):
return category_pat.findall(content)[90:121]
def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result
def get_job_list(content):
result = job_list_pat.findall(content)
return result
def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)
job_dict = {'jobname': job_name, 'category': category_name}
job_dict['JOBURL'] = job_url
print('scraping', job_name)
content = get_category_list(job_url)
description = get_job_details(content)
job_dict['Description'] = description
print(job_dict)
def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)
for job_info in job_list:
scrape_job_info(job_info, category_name)
next_page = get_next_page(content)
if next_page is None:
break
category_url = next_page
def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)
for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)
if __name__ == '__main__':
url = 'https://chicago.craigslist.org'
response = requests.get(url)
content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')
next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next > </a>s+<span class="button next" title="next page">s+next >s+</span>s+</span>s+</div>s+</div>s+.+s+.+')
job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')
crawl_website()
python-3.x web-scraping web-crawler craigslist
Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc
– Life is complex
Jan 18 at 18:16
No, i AM USING CORE PYTHON, NO LIBRARY...
– pythonerdude
Jan 18 at 18:28
ANY I HAVE FIXED THE ISSUE FINALLY
– pythonerdude
Jan 18 at 18:28
add a comment |
I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy
This is the URL: https://chicago.craigslist.org/
At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.
import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text
def get_category_list(content):
return category_pat.findall(content)[90:121]
def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result
def get_job_list(content):
result = job_list_pat.findall(content)
return result
def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)
job_dict = {'jobname': job_name, 'category': category_name}
job_dict['JOBURL'] = job_url
print('scraping', job_name)
content = get_category_list(job_url)
description = get_job_details(content)
job_dict['Description'] = description
print(job_dict)
def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)
for job_info in job_list:
scrape_job_info(job_info, category_name)
next_page = get_next_page(content)
if next_page is None:
break
category_url = next_page
def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)
for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)
if __name__ == '__main__':
url = 'https://chicago.craigslist.org'
response = requests.get(url)
content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')
next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next > </a>s+<span class="button next" title="next page">s+next >s+</span>s+</span>s+</div>s+</div>s+.+s+.+')
job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')
crawl_website()
python-3.x web-scraping web-crawler craigslist
I am trying to crawl Craglist jobs using python (I am not using scrapy) Can anyone please solve below this code? plese dont talk about scrapy
This is the URL: https://chicago.craigslist.org/
At first i am extracting job category, then job listing, then job details, also written code to crawl next page too.
import re
import requests
import csv
from html import unescape
def get_page_content(url):
response = requests.get(url)
return response.text
def get_category_list(content):
return category_pat.findall(content)[90:121]
def get_next_page(content):
result = next_page_pat.findall(content)
if len(result) == 0:
return None
else:
result = 'https://chicago.craigslist.org/' + result[0]
return result
def get_job_list(content):
result = job_list_pat.findall(content)
return result
def get_job_details(content):
result = desc_pat.findall(content)
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
def scrape_job_info(job_info, category_name):
job_url, job_name = job_info
job_name = unescape(job_name)
job_dict = {'jobname': job_name, 'category': category_name}
job_dict['JOBURL'] = job_url
print('scraping', job_name)
content = get_category_list(job_url)
description = get_job_details(content)
job_dict['Description'] = description
print(job_dict)
def crawl_category(category_name, category_url):
while True:
print(category_url)
content = get_page_content(category_url)
job_list = get_job_list(content)
print(job_list)
for job_info in job_list:
scrape_job_info(job_info, category_name)
next_page = get_next_page(content)
if next_page is None:
break
category_url = next_page
def crawl_website():
url = 'https://chicago.craigslist.org'
content = get_page_content(url)
category_list = get_category_list(content)
for category in category_list:
category_url, category_name = category
category_url = url + category_url
crawl_category(category_name, category_url)
if __name__ == '__main__':
url = 'https://chicago.craigslist.org'
response = requests.get(url)
content = response.text
category_pat = re.compile(r'<li><a href="(/d/[w-]+/w+/w+)".+txt">([w-+s+/<]+)<sup class')
next_page_pat = re.compile(
r'<a href="/(.*)" class="button next" title="nexts+page">next > </a>s+<span class="button next" title="next page">s+next >s+</span>s+</span>s+</div>s+</div>s+.+s+.+')
job_list_pat = re.compile(r'<a href="(https://w+.craigslist.org/chc/.+html)".+hdrlnk">([ws*]+)</a>')
desc_pat = re.compile(r'</div>s*<section id="postingbody">.+html"></div>s*</div>(.+)</section><ul')
img_pat = re.compile(r'<img src="(.*jpg)" title')
crawl_website()
python-3.x web-scraping web-crawler craigslist
python-3.x web-scraping web-crawler craigslist
edited Jan 18 at 18:49
vezunchik
51139
51139
asked Jan 18 at 17:44
pythonerdudepythonerdude
72
72
Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc
– Life is complex
Jan 18 at 18:16
No, i AM USING CORE PYTHON, NO LIBRARY...
– pythonerdude
Jan 18 at 18:28
ANY I HAVE FIXED THE ISSUE FINALLY
– pythonerdude
Jan 18 at 18:28
add a comment |
Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc
– Life is complex
Jan 18 at 18:16
No, i AM USING CORE PYTHON, NO LIBRARY...
– pythonerdude
Jan 18 at 18:28
ANY I HAVE FIXED THE ISSUE FINALLY
– pythonerdude
Jan 18 at 18:28
Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc
– Life is complex
Jan 18 at 18:16
Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc
– Life is complex
Jan 18 at 18:16
No, i AM USING CORE PYTHON, NO LIBRARY...
– pythonerdude
Jan 18 at 18:28
No, i AM USING CORE PYTHON, NO LIBRARY...
– pythonerdude
Jan 18 at 18:28
ANY I HAVE FIXED THE ISSUE FINALLY
– pythonerdude
Jan 18 at 18:28
ANY I HAVE FIXED THE ISSUE FINALLY
– pythonerdude
Jan 18 at 18:28
add a comment |
1 Answer
1
active
oldest
votes
You had a small string error in your code.
def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
add a comment |
Your Answer
StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");
StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);
StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});
function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});
}
});
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54259026%2fcrawling-craiglisht-with-python-not-scrapy%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
You had a small string error in your code.
def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
add a comment |
You had a small string error in your code.
def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
add a comment |
You had a small string error in your code.
def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
You had a small string error in your code.
def get_job_details(content):
# error was caused here
result = desc_pat.findall(str(content))
if len(result) == 0:
description = ''
else:
description = str(result[0])
return description
answered Jan 18 at 18:31
Life is complexLife is complex
249212
249212
add a comment |
add a comment |
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f54259026%2fcrawling-craiglisht-with-python-not-scrapy%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Have you considered using this? crummy.com/software/BeautifulSoup/bs4/doc
– Life is complex
Jan 18 at 18:16
No, i AM USING CORE PYTHON, NO LIBRARY...
– pythonerdude
Jan 18 at 18:28
ANY I HAVE FIXED THE ISSUE FINALLY
– pythonerdude
Jan 18 at 18:28