diff --git a/nhentai/parser.py b/nhentai/parser.py
index 6f5323a..7b9142c 100644
--- a/nhentai/parser.py
+++ b/nhentai/parser.py
@@ -1,54 +1,74 @@
 # coding: utf-8
-from __future__ import unicode_literals, print_function
 import os
 import re
-import threadpool
-import requests
 import time
 
 from bs4 import BeautifulSoup
 from tabulate import tabulate
 
 import nhentai.constant as constant
+from nhentai.utils import request
 from nhentai.logger import logger
 
 
-def request(method, url, **kwargs):
-    if not hasattr(requests, method):
-        raise AttributeError('\'requests\' object has no attribute \'{0}\''.format(method))
-
-    return requests.__dict__[method](url, proxies=constant.PROXY, verify=False, **kwargs)
-
-
-def login_parser(username, password):
-    s = requests.Session()
-    s.proxies = constant.PROXY
-    s.verify = False
-    s.headers.update({'Referer': constant.LOGIN_URL})
-
-    s.get(constant.LOGIN_URL)
-    content = s.get(constant.LOGIN_URL).content
+def _get_csrf_token(content):
     html = BeautifulSoup(content, 'html.parser')
     csrf_token_elem = html.find('input', attrs={'name': 'csrfmiddlewaretoken'})
-
     if not csrf_token_elem:
         raise Exception('Cannot find csrf token to login')
-    csrf_token = csrf_token_elem.attrs['value']
+    return csrf_token_elem.attrs['value']
+
+
+def login(username, password):
+    logger.warning('This feature is deprecated, please use --cookie to set your cookie.')
+    if os.getenv('DEBUG'):
+        logger.info('Getting CSRF token ...')
+
+    csrf_token = _get_csrf_token(request('get', url=constant.LOGIN_URL).text)
+    if os.getenv('DEBUG'):
+        logger.info('CSRF token is {}'.format(csrf_token))
 
     login_dict = {
         'csrfmiddlewaretoken': csrf_token,
         'username_or_email': username,
         'password': password,
     }
-    resp = s.post(constant.LOGIN_URL, data=login_dict)
+    resp = request('post', url=constant.LOGIN_URL, data=login_dict)
+
+    if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
+        csrf_token = _get_csrf_token(resp.text)
+        resp = request('post', url=resp.url, data={'csrfmiddlewaretoken': csrf_token, 'next': '/'})
+
     if 'Invalid username/email or password' in resp.text:
         logger.error('Login failed, please check your username and password')
         exit(1)
 
-    html = BeautifulSoup(s.get(constant.FAV_URL).content, 'html.parser')
+    if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
+        logger.error('Use nhentai --cookie \'YOUR_COOKIE_HERE\' to set your cookie instead.')
+        exit(2)
+
+
+def _get_title_and_id(response):
+    result = []
+    html = BeautifulSoup(response, 'html.parser')
+    doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
+    for doujinshi in doujinshi_search_result:
+        doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
+        title = doujinshi_container.text.strip()
+        title = title if len(title) < 85 else title[:82] + '...'
+        id_ = re.search(r'/g/(\d+)/', doujinshi.a['href']).group(1)
+        result.append({'id': id_, 'title': title})
+
+    return result
+
+
+def favorites_parser(page=None):
+    result = []
+    html = BeautifulSoup(request('get', constant.FAV_URL).content, 'html.parser')
     count = html.find('span', attrs={'class': 'count'})
     if not count:
         logger.error("Can't get your number of favorited doujins. Did the login fail?")
+        return []
 
     count = int(count.text.strip('(').strip(')').replace(',', ''))
 
     if count == 0:
@@ -56,36 +76,31 @@ def login_parser(username, password):
         return []
 
     pages = int(count / 25)
-    if pages:
-        pages += 1 if count % (25 * pages) else 0
+    if page:
+        page_range_list = page
     else:
-        pages = 1
-
-    logger.info('You have %d favorites in %d pages.' % (count, pages))
-
-    if os.getenv('DEBUG'):
-        pages = 1
+        if pages:
+            pages += 1 if count % (25 * pages) else 0
+        else:
+            pages = 1
 
-    ret = []
-    doujinshi_id = re.compile('data-id="([\d]+)"')
+        logger.info('You have %d favorites in %d pages.' % (count, pages))
 
-    def _callback(request, result):
-        ret.append(result)
+        if os.getenv('DEBUG'):
+            pages = 1
 
-    thread_pool = threadpool.ThreadPool(5)
+        page_range_list = range(1, pages + 1)
 
-    for page in range(1, pages+1):
+    for page in page_range_list:
         try:
             logger.info('Getting doujinshi ids of page %d' % page)
-            resp = s.get(constant.FAV_URL + '?page=%d' % page).text
-            ids = doujinshi_id.findall(resp)
-            requests_ = threadpool.makeRequests(doujinshi_parser, ids, _callback)
-            [thread_pool.putRequest(req) for req in requests_]
-            thread_pool.wait()
+            resp = request('get', constant.FAV_URL + '?page=%d' % page).content
+
+            result.extend(_get_title_and_id(resp))
         except Exception as e:
             logger.error('Error: %s, continue', str(e))
 
-    return ret
+    return result
 
 
 def doujinshi_parser(id_):
@@ -93,74 +108,79 @@ def doujinshi_parser(id_):
         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 
     id_ = int(id_)
     logger.log(15, 'Fetching information of doujinshi id {0}'.format(id_))
     doujinshi = dict()
     doujinshi['id'] = id_
-    url = '{0}/{1}'.format(constant.DETAIL_URL, id_)
-    i=0
-    while i<5:
-        try:
-            response = request('get', url).json()
-        except Exception as e:
-            i+=1
-            if not i<5:
-                logger.critical(str(e))
-                exit(1)
-            continue
-        break
+    url = '{0}/{1}/'.format(constant.DETAIL_URL, id_)
+
+    try:
+        response = request('get', url)
+        if response.status_code in (200,):
+            response = response.content
+        elif response.status_code in (404,):
+            logger.error("Doujinshi with id {0} cannot be found".format(id_))
+            return []
+        else:
+            logger.debug('Slow down and retry ({}) ...'.format(id_))
+            time.sleep(1)
+            return doujinshi_parser(str(id_))
+
+    except Exception as e:
+        logger.warn('Error: {}, ignored'.format(str(e)))
+        return None
 
-    doujinshi['name'] = response['title']['english']
-    doujinshi['subtitle'] = response['title']['japanese']
-    doujinshi['img_id'] = response['media_id']
-    doujinshi['ext'] = ''.join(map(lambda s: s['t'], response['images']['pages']))
-    doujinshi['pages'] = len(response['images']['pages'])
+    html = BeautifulSoup(response, 'html.parser')
+    doujinshi_info = html.find('div', attrs={'id': 'info'})
 
-    # gain information of the doujinshi
-    needed_fields = ['character', 'artist', 'language', 'tag']
-    for tag in response['tags']:
-        tag_type = tag['type']
-        if tag_type in needed_fields:
-            if tag_type == 'tag':
-                if tag_type not in doujinshi:
-                    doujinshi[tag_type] = {}
+    title = doujinshi_info.find('h1').text
+    subtitle = doujinshi_info.find('h2')
 
-            tag['name'] = tag['name'].replace(' ', '-')
-            tag['name'] = tag['name'].lower()
-            doujinshi[tag_type][tag['name']] = tag['id']
-        elif tag_type not in doujinshi:
-            doujinshi[tag_type] = tag['name']
-        else:
-            doujinshi[tag_type] += ', ' + tag['name']
+    doujinshi['name'] = title
+    doujinshi['subtitle'] = subtitle.text if subtitle else ''
 
-    return doujinshi
+    doujinshi_cover = html.find('div', attrs={'id': 'cover'})
+    img_id = re.search(r'/galleries/([\d]+)/cover\.(jpg|png|gif)$', doujinshi_cover.a.img.attrs['data-src'])
 
+    ext = []
+    for i in html.find_all('div', attrs={'class': 'thumb-container'}):
+        _, ext_name = os.path.basename(i.img.attrs['data-src']).rsplit('.', 1)
+        ext.append(ext_name)
 
-def search_parser(keyword, page):
-    logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
-    result = []
-    i=0
-    while i<5:
-        try:
-            response = request('get', url=constant.SEARCH_URL, params={'query': keyword, 'page': page}).json()
-        except Exception as e:
-            i+=1
-            if not i<5:
-                logger.critical(str(e))
-                logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
-                exit(1)
-            continue
-        break
+    if not img_id:
+        logger.critical('Failed to get the image id of the cover')
+        exit(1)
+
+    doujinshi['img_id'] = img_id.group(1)
+    doujinshi['ext'] = ext
 
-    if 'result' not in response:
-        raise Exception('No result in response')
+    for field in doujinshi_info.find_all('div', class_='tag-container field-name'):
+        if re.search('Pages:', field.text):
+            pages = field.find('span', class_='name').string
+            doujinshi['pages'] = int(pages)
 
-    for row in response['result']:
-        title = row['title']['english']
-        title = title[:85] + '..' if len(title) > 85 else title
-        result.append({'id': row['id'], 'title': title})
+    # gather information about the doujinshi
+    information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
+    needed_fields = ['Characters', 'Artists', 'Languages', 'Tags', 'Parodies', 'Groups', 'Categories']
+    for field in information_fields:
+        field_name = field.contents[0].strip().strip(':')
+        if field_name in needed_fields:
+            data = [sub_field.find('span', attrs={'class': 'name'}).contents[0].strip() for sub_field in
+                    field.find_all('a', attrs={'class': 'tag'})]
+            doujinshi[field_name.lower()] = ', '.join(data)
+
+    time_field = doujinshi_info.find('time')
+    if time_field.has_attr('datetime'):
+        doujinshi['date'] = time_field['datetime']
     return doujinshi
 
+
+def old_search_parser(keyword, sorting='date', page=1):
+    logger.debug('Searching doujinshi with keyword {0}'.format(keyword))
+    response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
+
+    result = _get_title_and_id(response)
     if not result:
-        logger.warn('No results for keywords {}'.format(keyword))
+        logger.warn('No results for keyword {}'.format(keyword))
 
     return result
 
@@ -170,88 +190,96 @@ def print_doujinshi(doujinshi_list):
         return
     doujinshi_list = [(i['id'], i['title']) for i in doujinshi_list]
     headers = ['id', 'doujinshi']
-    logger.info('Search Result\n' +
+    logger.info('Search Result || Found %i doujinshis\n' % len(doujinshi_list) +
                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 
 
-def tag_parser(tag_id, max_page=1):
-    logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
+def search_parser(keyword, sorting, page, is_page_all=False):
+    # keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
     result = []
-    i=0
-    while i<5:
-        try:
-            response = request('get', url=constant.TAG_API_URL, params={'sort': 'popular', 'tag_id': tag_id}).json()
-        except Exception as e:
-            i+=1
-            if not i<5:
+    if not page:
+        page = [1]
+
+    if is_page_all:
+        url = request('get', url=constant.SEARCH_URL, params={'query': keyword}).url
+        init_response = request('get', url.replace('%2B', '+')).json()
+        page = range(1, init_response['num_pages']+1)
+
+    total = '/{0}'.format(page[-1]) if is_page_all else ''
+    for p in page:
+        i = 0
+
+        logger.info('Searching doujinshis using keywords "{0}" on page {1}{2}'.format(keyword, p, total))
+        response = None
+        while i < 3:
+            try:
+                url = request('get', url=constant.SEARCH_URL, params={'query': keyword,
+                                                                      'page': p, 'sort': sorting}).url
+                response = request('get', url.replace('%2B', '+')).json()
+                break
+            except Exception as e:
                 logger.critical(str(e))
+                i += 1
-                exit(1)
-            continue
-        break
-    page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])
-
-    for i in range(1, page+1):
-        logger.info('Getting page {} ...'.format(i))
-
-        if page != 1:
-            i=0
-            while i<5:
-                try:
-                    response = request('get', url=constant.TAG_API_URL, params={'sort': 'popular', 'tag_id': tag_id}).json()
-                except Exception as e:
-                    i+=1
-                    if not i<5:
-                        logger.critical(str(e))
-                        exit(1)
-                    continue
-                break
-        for row in response['result']:
-            title = row['title']['english']
-            title = title[:85] + '..' if len(title) > 85 else title
-            result.append({'id': row['id'], 'title': title})
-        if not result:
-            logger.warn('No results for tag id {}'.format(tag_id))
-
+        if response is None or 'result' not in response:
+            logger.warn('No result in response on page {}'.format(p))
+            break
+
+        for row in response['result']:
+            title = row['title']['english']
+            title = title[:85] + '..' if len(title) > 85 else title
+            result.append({'id': row['id'], 'title': title})
+
+    if not result:
+        logger.warn('No results for keywords {}'.format(keyword))
+
     return result
 
 
-def tag_guessing(tag_name):
-    tag_name = tag_name.lower()
-    tag_name = tag_name.replace(' ', '-')
-    logger.info('Trying to get tag_id of tag \'{0}\''.format(tag_name))
-    i=0
-    while i<5:
+def __api_suspended_doujinshi_parser(id_):
+    if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
+        raise Exception('Doujinshi id({0}) is not valid'.format(id_))
+
+    id_ = int(id_)
+    logger.log(15, 'Fetching information of doujinshi id {0}'.format(id_))
+    doujinshi = dict()
+    doujinshi['id'] = id_
+    url = '{0}/{1}'.format(constant.DETAIL_URL, id_)
+    i = 0
+    while i < 5:
         try:
-            response = request('get', url='%s/%s' % (constant.TAG_URL, tag_name)).content
+            response = request('get', url).json()
         except Exception as e:
-            i+=1
-            if not i<5:
+            i += 1
+            if not i < 5:
                 logger.critical(str(e))
                 exit(1)
             continue
         break
 
-    html = BeautifulSoup(response, 'html.parser')
-    first_item = html.find('div', attrs={'class': 'gallery'})
-    if not first_item:
-        logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
-        return
+    doujinshi['name'] = response['title']['english']
+    doujinshi['subtitle'] = response['title']['japanese']
+    doujinshi['img_id'] = response['media_id']
+    doujinshi['ext'] = ''.join([i['t'] for i in response['images']['pages']])
+    doujinshi['pages'] = len(response['images']['pages'])
 
-    doujinshi_id = re.findall('(\d+)', first_item.a.attrs['href'])
-    if not doujinshi_id:
-        logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
-        return
+    # gather information about the doujinshi
+    needed_fields = ['character', 'artist', 'language', 'tag', 'parody', 'group', 'category']
+    for tag in response['tags']:
+        tag_type = tag['type']
+        if tag_type in needed_fields:
+            if tag_type == 'tag':
+                if tag_type not in doujinshi:
+                    doujinshi[tag_type] = {}
 
-    ret = doujinshi_parser(doujinshi_id[0])
-    if 'tag' in ret and tag_name in ret['tag']:
-        tag_id = ret['tag'][tag_name]
-        logger.info('Tag id of tag \'{0}\' is {1}'.format(tag_name, tag_id))
-    else:
-        logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
-        return
+ tag['name'] = tag['name'].replace(' ', '-') + tag['name'] = tag['name'].lower() + doujinshi[tag_type][tag['name']] = tag['id'] + elif tag_type not in doujinshi: + doujinshi[tag_type] = tag['name'] + else: + doujinshi[tag_type] += ', ' + tag['name'] - return tag_id + return doujinshi if __name__ == '__main__':
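
A minimal usage sketch (not part of the patch) of how the refactored entry points fit together. The signatures of search_parser, print_doujinshi, and doujinshi_parser are taken from the diff above; the keyword, page list, and printed fields are illustrative assumptions.

    from nhentai.parser import search_parser, doujinshi_parser, print_doujinshi

    # search_parser now takes a sorting key, an explicit page list, and an
    # is_page_all flag, and returns a list of {'id': ..., 'title': ...} dicts.
    results = search_parser('full color', sorting='date', page=[1])
    print_doujinshi(results)

    # doujinshi_parser now scrapes the HTML detail page instead of the JSON API
    # and returns a dict with name, subtitle, img_id, ext, tags, and so on;
    # 'pages' may be absent, hence .get().
    if results:
        info = doujinshi_parser(results[0]['id'])
        print(info['name'], info.get('pages'))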