diff --git a/nhentai/parser.py b/nhentai/parser.py
index 97c1b15..7b9142c 100644
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -1,7 +1,5 @@
 # coding: utf-8
-from __future__ import unicode_literals, print_function
 
-import sys
 import os
 import re
 import time
@@ -64,7 +62,7 @@ def _get_title_and_id(response):
     return result
 
 
-def favorites_parser(page_range=''):
+def favorites_parser(page=None):
     result = []
     html = BeautifulSoup(request('get', constant.FAV_URL).content, 'html.parser')
     count = html.find('span', attrs={'class': 'count'})
@@ -78,20 +76,20 @@ def favorites_parser(page_range=''):
         return []
 
     pages = int(count / 25)
-    if pages:
-        pages += 1 if count % (25 * pages) else 0
+    if page:
+        page_range_list = page
     else:
-        pages = 1
+        if pages:
+            pages += 1 if count % (25 * pages) else 0
+        else:
+            pages = 1
 
-    logger.info('You have %d favorites in %d pages.' % (count, pages))
+        logger.info('You have %d favorites in %d pages.' % (count, pages))
 
-    if os.getenv('DEBUG'):
-        pages = 1
+        if os.getenv('DEBUG'):
+            pages = 1
 
-    page_range_list = range(1, pages + 1)
-    if page_range:
-        logger.info('page range is {0}'.format(page_range))
-        page_range_list = page_range_parser(page_range, pages)
+        page_range_list = range(1, pages + 1)
 
     for page in page_range_list:
         try:
@@ -105,32 +103,6 @@ def favorites_parser(page_range=''):
     return result
 
 
-def page_range_parser(page_range, max_page_num):
-    pages = set()
-    ranges = str.split(page_range, ',')
-    for range_str in ranges:
-        idx = range_str.find('-')
-        if idx == -1:
-            try:
-                page = int(range_str)
-                if page <= max_page_num:
-                    pages.add(page)
-            except ValueError:
-                logger.error('page range({0}) is not valid'.format(page_range))
-        else:
-            try:
-                left = int(range_str[:idx])
-                right = int(range_str[idx + 1:])
-                if right > max_page_num:
-                    right = max_page_num
-                for page in range(left, right + 1):
-                    pages.add(page)
-            except ValueError:
-                logger.error('page range({0}) is not valid'.format(page_range))
-
-    return list(pages)
-
-
 def doujinshi_parser(id_):
     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
@@ -143,8 +115,11 @@ def doujinshi_parser(id_):
 
     try:
         response = request('get', url)
-        if response.status_code in (200,):
+        if response.status_code in (200, ):
             response = response.content
+        elif response.status_code in (404,):
+            logger.error("Doujinshi with id {0} cannot be found".format(id_))
+            return []
         else:
             logger.debug('Slow down and retry ({}) ...'.format(id_))
             time.sleep(1)
@@ -178,7 +153,6 @@ def doujinshi_parser(id_):
     doujinshi['img_id'] = img_id.group(1)
     doujinshi['ext'] = ext
 
-    pages = 0
     for _ in doujinshi_info.find_all('div', class_='tag-container field-name'):
         if re.search('Pages:', _.text):
             pages = _.find('span', class_='name').string
@@ -216,38 +190,47 @@ def print_doujinshi(doujinshi_list):
         return
     doujinshi_list = [(i['id'], i['title']) for i in doujinshi_list]
     headers = ['id', 'doujinshi']
-    logger.info('Search Result\n' +
+    logger.info('Search Result || Found %i doujinshis \n' % doujinshi_list.__len__() +
                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 
 
-def search_parser(keyword, sorting, page):
-    logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
-    keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
+def search_parser(keyword, sorting, page, is_page_all=False):
+    # keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
     result = []
-    i = 0
-    while i < 5:
-        try:
-            url = request('get', url=constant.SEARCH_URL, params={'query': keyword, 'page': page, 'sort': sorting}).url
-            response = request('get', url.replace('%2B', '+')).json()
-        except Exception as e:
-            i += 1
-            if not i < 5:
+    if not page:
+        page = [1]
+
+    if is_page_all:
+        url = request('get', url=constant.SEARCH_URL, params={'query': keyword}).url
+        init_response = request('get', url.replace('%2B', '+')).json()
+        page = range(1, init_response['num_pages']+1)
+
+    total = '/{0}'.format(page[-1]) if is_page_all else ''
+    for p in page:
+        i = 0
+
+        logger.info('Searching doujinshis using keywords "{0}" on page {1}{2}'.format(keyword, p, total))
+        while i < 3:
+            try:
+                url = request('get', url=constant.SEARCH_URL, params={'query': keyword,
+                                                                      'page': p, 'sort': sorting}).url
+                response = request('get', url.replace('%2B', '+')).json()
+            except Exception as e:
                 logger.critical(str(e))
-                logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
-                exit(1)
-            continue
-        break
+                break
 
-    if 'result' not in response:
-        raise Exception('No result in response')
+            if 'result' not in response:
+                logger.warn('No result in response in page {}'.format(p))
+                break
 
-    for row in response['result']:
-        title = row['title']['english']
-        title = title[:85] + '..' if len(title) > 85 else title
-        result.append({'id': row['id'], 'title': title})
+            for row in response['result']:
+                title = row['title']['english']
+                title = title[:85] + '..' if len(title) > 85 else title
+                result.append({'id': row['id'], 'title': title})
 
-    if not result:
-        logger.warn('No results for keywords {}'.format(keyword))
+
+            if not result:
+                logger.warn('No results for keywords {}'.format(keyword))
 
     return result
 
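For reference, a minimal sketch of how a caller might drive the two reworked
entry points after this patch. It assumes the nhentai package from this
repository is importable and that any required cookie/proxy setup in
nhentai.constant has already been done; the 'recent' sort value is an
illustrative assumption, not something this diff defines.

    from nhentai.parser import favorites_parser, search_parser

    # favorites_parser() now takes an explicit list of page numbers instead
    # of a page-range string; passing None walks every favorites page.
    favorites = favorites_parser(page=[1, 2])

    # search_parser() now loops over a list of pages, or over every result
    # page when is_page_all=True (it first asks the search API for 'num_pages').
    first_page = search_parser('full color', sorting='recent', page=[1])
    everything = search_parser('full color', sorting='recent', page=None,
                               is_page_all=True)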