X-Git-Url: https://git.lizzy.rs/?a=blobdiff_plain;f=nhentai%2Fparser.py;h=c61f8f71515bdf0d3a0fc2dd1b567d6b92bb9341;hb=f157ac32469231a01823fd14b17870fb4e6b44e8;hp=9b9232b3c7674df27429e23bf4c05b14fccea6c3;hpb=3017fff823a9fded34bfced4841d7287eb2204cf;p=nhentai.git

diff --git a/nhentai/parser.py b/nhentai/parser.py
index 9b9232b..c61f8f7 100644
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -1,10 +1,9 @@
 # coding: utf-8
 from __future__ import unicode_literals, print_function
 
+import sys
 import os
 import re
-import threadpool
-import requests
 import time
 from bs4 import BeautifulSoup
 from tabulate import tabulate
@@ -65,7 +64,7 @@ def _get_title_and_id(response):
     return result
 
 
-def favorites_parser():
+def favorites_parser(page=None):
     result = []
     html = BeautifulSoup(request('get', constant.FAV_URL).content, 'html.parser')
     count = html.find('span', attrs={'class': 'count'})
@@ -79,17 +78,22 @@ def favorites_parser():
         return []
 
     pages = int(count / 25)
-    if pages:
-        pages += 1 if count % (25 * pages) else 0
+    if page:
+        page_range_list = page
     else:
-        pages = 1
+        if pages:
+            pages += 1 if count % (25 * pages) else 0
+        else:
+            pages = 1
 
-    logger.info('You have %d favorites in %d pages.' % (count, pages))
+        logger.info('You have %d favorites in %d pages.' % (count, pages))
 
-    if os.getenv('DEBUG'):
-        pages = 1
+        if os.getenv('DEBUG'):
+            pages = 1
 
-    for page in range(1, pages + 1):
+        page_range_list = range(1, pages + 1)
+
+    for page in page_range_list:
         try:
             logger.info('Getting doujinshi ids of page %d' % page)
             resp = request('get', constant.FAV_URL + '?page=%d' % page).content
@@ -113,7 +117,7 @@ def doujinshi_parser(id_):
 
     try:
         response = request('get', url)
-        if response.status_code in (200, ):
+        if response.status_code in (200,):
            response = response.content
         else:
             logger.debug('Slow down and retry ({}) ...'.format(id_))
@@ -134,7 +138,7 @@ def doujinshi_parser(id_):
     doujinshi['subtitle'] = subtitle.text if subtitle else ''
 
     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
-    img_id = re.search('/galleries/([\d]+)/cover\.(jpg|png)$', doujinshi_cover.a.img.attrs['data-src'])
+    img_id = re.search('/galleries/([\d]+)/cover\.(jpg|png|gif)$', doujinshi_cover.a.img.attrs['data-src'])
 
     ext = []
     for i in html.find_all('div', attrs={'class': 'thumb-container'}):
@@ -148,28 +152,28 @@ def doujinshi_parser(id_):
     doujinshi['img_id'] = img_id.group(1)
     doujinshi['ext'] = ext
 
-    pages = 0
-    for _ in doujinshi_info.find_all('div', class_=''):
-        pages = re.search('([\d]+) pages', _.text)
-        if pages:
-            pages = pages.group(1)
-            break
+    for _ in doujinshi_info.find_all('div', class_='tag-container field-name'):
+        if re.search('Pages:', _.text):
+            pages = _.find('span', class_='name').string
     doujinshi['pages'] = int(pages)
 
     # gain information of the doujinshi
     information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
-    needed_fields = ['Characters', 'Artists', 'Languages', 'Tags']
+    needed_fields = ['Characters', 'Artists', 'Languages', 'Tags', 'Parodies', 'Groups', 'Categories']
     for field in information_fields:
         field_name = field.contents[0].strip().strip(':')
         if field_name in needed_fields:
-            data = [sub_field.contents[0].strip() for sub_field in
+            data = [sub_field.find('span', attrs={'class': 'name'}).contents[0].strip() for sub_field in
                     field.find_all('a', attrs={'class': 'tag'})]
            doujinshi[field_name.lower()] = ', '.join(data)
 
+    time_field = doujinshi_info.find('time')
+    if time_field.has_attr('datetime'):
+        doujinshi['date'] = time_field['datetime']
     return doujinshi
 
 
-def search_parser(keyword, sorting='date', page=1):
+def old_search_parser(keyword, sorting='date', page=1):
     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
     response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
 
@@ -185,82 +189,48 @@ def print_doujinshi(doujinshi_list):
         return
     doujinshi_list = [(i['id'], i['title']) for i in doujinshi_list]
     headers = ['id', 'doujinshi']
-    logger.info('Search Result\n' +
+    logger.info('Search Result || Found %i doujinshis \n' % doujinshi_list.__len__() +
                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 
 
-def tag_parser(tag_name, sorting='date', max_page=1):
+def search_parser(keyword, sorting, page, is_page_all=False):
+    # keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
     result = []
-    tag_name = tag_name.lower()
-    tag_name = tag_name.replace(' ', '-')
-
-    if sorting == 'date':
-        sorting = ''
-
-    for p in range(1, max_page + 1):
-        logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, tag_name))
-        response = request('get', url='%s/%s/%s?page=%d' % (constant.TAG_URL, tag_name, sorting, p)).content
-
-        result += _get_title_and_id(response)
-        if not result:
-            logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
-            return
-
-    if not result:
-        logger.warn('No results for tag \'{}\''.format(tag_name))
-
-    return result
-
-
-def __api_suspended_search_parser(keyword, sorting, page):
-    logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
-    result = []
-    i = 0
-    while i < 5:
-        try:
-            response = request('get', url=constant.SEARCH_URL, params={'query': keyword, 'page': page, 'sort': sorting}).json()
-        except Exception as e:
-            i += 1
-            if not i < 5:
+    if not page:
+        page = [1]
+
+    if is_page_all:
+        url = request('get', url=constant.SEARCH_URL, params={'query': keyword}).url
+        init_response = request('get', url.replace('%2B', '+')).json()
+        page = range(1, init_response['num_pages']+1)
+
+    for p in page:
+        i = 0
+        if is_page_all:
+            total = '/{0}'.format(page[-1])
+
+        logger.info('Searching doujinshis using keywords "{0}" on page {1}{2}'.format(keyword, p, total))
+        while i < 3:
+            try:
+                url = request('get', url=constant.SEARCH_URL, params={'query': keyword,
+                                                                      'page': p, 'sort': sorting}).url
+                response = request('get', url.replace('%2B', '+')).json()
+            except Exception as e:
                 logger.critical(str(e))
-                logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
-                exit(1)
-            continue
-        break
-
-    if 'result' not in response:
-        raise Exception('No result in response')
-
-    for row in response['result']:
-        title = row['title']['english']
-        title = title[:85] + '..' if len(title) > 85 else title
-        result.append({'id': row['id'], 'title': title})
-
-    if not result:
-        logger.warn('No results for keywords {}'.format(keyword))
-
-    return result
+            break
 
 
-def __api_suspended_tag_parser(tag_id, sorting, max_page=1):
-    logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
-    result = []
-    response = request('get', url=constant.TAG_API_URL, params={'sort': sorting, 'tag_id': tag_id}).json()
-    page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])
-
-    for i in range(1, page + 1):
-        logger.info('Getting page {} ...'.format(i))
+        if 'result' not in response:
+            logger.warn('No result in response in page {}'.format(p))
+            break
 
-        if page != 1:
-            response = request('get', url=constant.TAG_API_URL,
-                               params={'sort': sorting, 'tag_id': tag_id}).json()
-        for row in response['result']:
-            title = row['title']['english']
-            title = title[:85] + '..' if len(title) > 85 else title
-            result.append({'id': row['id'], 'title': title})
+        for row in response['result']:
+            title = row['title']['english']
+            title = title[:85] + '..' if len(title) > 85 else title
+            result.append({'id': row['id'], 'title': title})
 
-    if not result:
-        logger.warn('No results for tag id {}'.format(tag_id))
+    if not result:
+        logger.warn('No results for keywords {}'.format(keyword))
 
     return result
 
@@ -289,11 +259,11 @@ def __api_suspended_doujinshi_parser(id_):
     doujinshi['name'] = response['title']['english']
     doujinshi['subtitle'] = response['title']['japanese']
     doujinshi['img_id'] = response['media_id']
-    doujinshi['ext'] = ''.join(map(lambda s: s['t'], response['images']['pages']))
+    doujinshi['ext'] = ''.join([i['t'] for i in response['images']['pages']])
     doujinshi['pages'] = len(response['images']['pages'])
 
     # gain information of the doujinshi
-    needed_fields = ['character', 'artist', 'language', 'tag']
+    needed_fields = ['character', 'artist', 'language', 'tag', 'parody', 'group', 'category']
     for tag in response['tags']:
         tag_type = tag['type']
         if tag_type in needed_fields:
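
Below is a minimal usage sketch (not part of the diff) for the entry points this change touches. The keyword, sort value and page lists are illustrative placeholders rather than values taken from the commit, and the favourites call assumes a logged-in session cookie has already been configured for nhentai.

# Usage sketch (assumption-laden): exercising the signatures changed above.
from nhentai.parser import doujinshi_parser, favorites_parser, search_parser

# search_parser() now takes an explicit page list plus an is_page_all flag;
# with is_page_all=True it expands the list to every result page reported
# by the search API before fetching.
results = search_parser('full color', sorting='popular', page=[1], is_page_all=True)

# favorites_parser() accepts an optional list of favourites pages to fetch;
# leaving it as None keeps the old behaviour of walking every page
# (meaningful results require a logged-in cookie).
favorites = favorites_parser(page=[1])

# doujinshi_parser() keeps its signature; the returned dict now also carries
# a 'date' field read from the gallery's <time datetime="..."> element.
detail = doujinshi_parser(results[0]['id']) if results else None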