From d991bdc06e09f349031f6214d481a46fbd4c9c4d Mon Sep 17 00:00:00 2001
From: Vishal Rohra
Date: Fri, 6 Jan 2017 23:37:16 +0530
Subject: [PATCH 1/3] Added Python section

---
 g4g.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/g4g.py b/g4g.py
index 20b10a3..056a835 100755
--- a/g4g.py
+++ b/g4g.py
@@ -10,9 +10,9 @@ BASE_URL = 'http://www.geeksforgeeks.org/'
 
 articles = []
 
-choice_to_category = {1: 'c', 2: 'c-plus-plus', 3: 'java',
-                      4: 'fundamentals-of-algorithms',
-                      5: 'data-structures'}
+choice_to_category = {1: 'c', 2: 'c-plus-plus', 3: 'java', 4: 'python',
+                      5: 'fundamentals-of-algorithms',
+                      6: 'data-structures'}
 
 
 def display_menu():
@@ -20,8 +20,9 @@ def display_menu():
     print("1. C Language")
     print("2. C++ Language")
     print("3. Java")
-    print("4. Algorithms")
-    print("5. Data Structures")
+    print("4. Python")
+    print("5. Algorithms")
+    print("6. Data Structures")
 
 
 def get_category_choice():

From 9acb8b5b5924d72ef4bd51398a8213715ddbe51d Mon Sep 17 00:00:00 2001
From: Vishal Rohra
Date: Sat, 7 Jan 2017 00:10:22 +0530
Subject: [PATCH 2/3] Added in-page links for HTML

---
 g4g.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/g4g.py b/g4g.py
index 056a835..9bbab5b 100755
--- a/g4g.py
+++ b/g4g.py
@@ -69,7 +69,7 @@ def scrape_category(categoryUrl):
     # Selecting links which are in the category page
     links = [a.attrs.get('href') for a in soup.select('article li a')]
     # Removing links for the categories with anchor on same page
-    links = [link for link in links if not link.startswith('#')]
+    links = [link.strip() for link in links if not link.startswith('#')]
 
     print("Found: " + str(len(links)) + " links")
     i = 1
@@ -87,7 +87,15 @@ def scrape_category(categoryUrl):
         [script.extract() for script in link_soup(["script", "ins"])]
         for code_tag in link_soup.find_all('pre'):
             code_tag['class'] = code_tag.get('class', []) + ['prettyprint']
+        # Set an id on the article first
         article = link_soup.find('article')
+        article["id"] = link.replace("http://geeksforgeeks.org/", "").replace("http://geeksquiz.com/", "").replace("http://www.geeksforgeeks.org/", "")
+        # Now change all the links to point to their respective ids
+        for a in article.findAll('a'):
+            try:
+                a['href'] = a['href'].replace("http://geeksforgeeks.org/", "#").replace("http://geeksquiz.com/", "#").replace("http://www.geeksforgeeks.org/", "#")
+            except KeyError:
+                continue
         # Now add this article to list of all articles
         articles.append(article.encode('UTF-8'))
         # Sometimes hanging. So Ctrl ^ C, and try the next link.
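Note on PATCH 2/3: the goal is to make the compiled HTML self-contained. Each downloaded <article> gets an id derived from its URL slug, and absolute geeksforgeeks.org / geeksquiz.com links are rewritten into "#slug" in-page anchors. Below is a minimal, self-contained sketch of that transformation, assuming only that bs4 is installed; the sample HTML and the PREFIXES tuple are illustrative, not part of g4g.py.

from bs4 import BeautifulSoup

# Hypothetical prefix list; g4g.py chains .replace() calls instead.
PREFIXES = ("http://geeksforgeeks.org/",
            "http://www.geeksforgeeks.org/",
            "http://geeksquiz.com/")

link = "http://www.geeksforgeeks.org/bubble-sort/"
html = '<article><a href="http://www.geeksforgeeks.org/bubble-sort/">Bubble Sort</a></article>'

soup = BeautifulSoup(html, "html.parser")
article = soup.find("article")

# Derive the slug by stripping every known site prefix from the URL.
slug = link
for prefix in PREFIXES:
    slug = slug.replace(prefix, "")
article["id"] = slug  # the article is now addressable as "#bubble-sort/"

# Rewrite absolute links into in-page anchors pointing at those ids.
for a in article.find_all("a"):
    href = a.get("href")  # .get() avoids the KeyError the patch catches
    if href is None:
        continue
    for prefix in PREFIXES:
        href = href.replace(prefix, "#")
    a["href"] = href  # now "#bubble-sort/"

print(article)

Using a.get("href") instead of a["href"] sidesteps the try/except KeyError handling in the patch; the rewrite itself is the same chained-replace idea.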
From 9dd568350d5e3fb403cca8dfd333a3cec3f95a9d Mon Sep 17 00:00:00 2001
From: Vishal Rohra
Date: Tue, 7 Feb 2017 08:12:55 +0530
Subject: [PATCH 3/3] Removed in-page links for HTML

---
 g4g.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/g4g.py b/g4g.py
index 9bbab5b..056a835 100755
--- a/g4g.py
+++ b/g4g.py
@@ -69,7 +69,7 @@ def scrape_category(categoryUrl):
     # Selecting links which are in the category page
     links = [a.attrs.get('href') for a in soup.select('article li a')]
     # Removing links for the categories with anchor on same page
-    links = [link.strip() for link in links if not link.startswith('#')]
+    links = [link for link in links if not link.startswith('#')]
 
     print("Found: " + str(len(links)) + " links")
     i = 1
@@ -87,15 +87,7 @@ def scrape_category(categoryUrl):
         [script.extract() for script in link_soup(["script", "ins"])]
         for code_tag in link_soup.find_all('pre'):
             code_tag['class'] = code_tag.get('class', []) + ['prettyprint']
-        # Set an id on the article first
         article = link_soup.find('article')
-        article["id"] = link.replace("http://geeksforgeeks.org/", "").replace("http://geeksquiz.com/", "").replace("http://www.geeksforgeeks.org/", "")
-        # Now change all the links to point to their respective ids
-        for a in article.findAll('a'):
-            try:
-                a['href'] = a['href'].replace("http://geeksforgeeks.org/", "#").replace("http://geeksquiz.com/", "#").replace("http://www.geeksforgeeks.org/", "#")
-            except KeyError:
-                continue
         # Now add this article to list of all articles
         articles.append(article.encode('UTF-8'))
         # Sometimes hanging. So Ctrl ^ C, and try the next link.
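Note on PATCH 3/3: this reverts PATCH 2/3, so scrape_category() is back to collecting article URLs from the category page, dropping same-page anchors, and appending each <article> unmodified. A minimal sketch of the surviving filter step, again assuming bs4 is installed; the sample HTML is illustrative, and the extra "link and" guard (for <a> tags without an href) is my addition, not in g4g.py.

from bs4 import BeautifulSoup

# Illustrative stand-in for the fetched category page.
html = '''
<article>
  <li><a href="#sorting">Jump to the sorting section</a></li>
  <li><a href="http://www.geeksforgeeks.org/bubble-sort/">Bubble Sort</a></li>
</article>
'''
soup = BeautifulSoup(html, "html.parser")

# Same selector and filter as scrape_category(): keep article URLs,
# drop anchors that only jump around the category page itself.
links = [a.attrs.get('href') for a in soup.select('article li a')]
links = [link for link in links if link and not link.startswith('#')]

print(links)  # ['http://www.geeksforgeeks.org/bubble-sort/']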