nhentai/parser.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals, print_function
   3
   4 import os
   5 import re
   6 import threadpool
   7 import requests
   8 import time
   9 from bs4 import BeautifulSoup
  10 from tabulate import tabulate
  11
  12 import nhentai.constant as constant
  13 from nhentai.utils import request
  14 from nhentai.logger import logger
  15
  16
  17 def _get_csrf_token(content):
  18     html = BeautifulSoup(content, 'html.parser')
  19     csrf_token_elem = html.find('input', attrs={'name': 'csrfmiddlewaretoken'})
  20     if not csrf_token_elem:
  21         raise Exception('Cannot find csrf token to login')
  22     return csrf_token_elem.attrs['value']
  23
  24
  25 def login(username, password):
  26     logger.warning('This feature is deprecated, please use --cookie to set your cookie.')
  27     csrf_token = _get_csrf_token(request('get', url=constant.LOGIN_URL).text)
  28     if os.getenv('DEBUG'):
  29         logger.info('Getting CSRF token ...')
  30
  31     if os.getenv('DEBUG'):
  32         logger.info('CSRF token is {}'.format(csrf_token))
  33
  34     login_dict = {
  35         'csrfmiddlewaretoken': csrf_token,
  36         'username_or_email': username,
  37         'password': password,
  38     }
  39     resp = request('post', url=constant.LOGIN_URL, data=login_dict)
  40
  41     if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
  42         csrf_token = _get_csrf_token(resp.text)
  43         resp = request('post', url=resp.url, data={'csrfmiddlewaretoken': csrf_token, 'next': '/'})
  44
  45     if 'Invalid username/email or password' in resp.text:
  46         logger.error('Login failed, please check your username and password')
  47         exit(1)
  48
  49     if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
  50         logger.error('Using nhentai --cookie \'YOUR_COOKIE_HERE\' to save your Cookie.')
  51         exit(2)
  52
  53
  54 def _get_title_and_id(response):
  55     result = []
  56     html = BeautifulSoup(response, 'html.parser')
  57     doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
  58     for doujinshi in doujinshi_search_result:
  59         doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
  60         title = doujinshi_container.text.strip()
  61         title = title if len(title) < 85 else title[:82] + '...'
  62         id_ = re.search('/g/(\d+)/', doujinshi.a['href']).group(1)
  63         result.append({'id': id_, 'title': title})
  64
  65     return result
  66
  67
  68 def favorites_parser():
  69     result = []
  70     html = BeautifulSoup(request('get', constant.FAV_URL).content, 'html.parser')
  71     count = html.find('span', attrs={'class': 'count'})
  72     if not count:
  73         logger.error("Can't get your number of favorited doujins. Did the login failed?")
  74         return []
  75
  76     count = int(count.text.strip('(').strip(')').replace(',', ''))
  77     if count == 0:
  78         logger.warning('No favorites found')
  79         return []
  80     pages = int(count / 25)
  81
  82     if pages:
  83         pages += 1 if count % (25 * pages) else 0
  84     else:
  85         pages = 1
  86
  87     logger.info('You have %d favorites in %d pages.' % (count, pages))
  88
  89     if os.getenv('DEBUG'):
  90         pages = 1
  91
  92     for page in range(1, pages + 1):
  93         try:
  94             logger.info('Getting doujinshi ids of page %d' % page)
  95             resp = request('get', constant.FAV_URL + '?page=%d' % page).content
  96
  97             result.extend(_get_title_and_id(resp))
  98         except Exception as e:
  99             logger.error('Error: %s, continue', str(e))
 100
 101     return result
 102
 103
 104 def doujinshi_parser(id_):
 105     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
 106         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 107
 108     id_ = int(id_)
 109     logger.log(15, 'Fetching doujinshi information of id {0}'.format(id_))
 110     doujinshi = dict()
 111     doujinshi['id'] = id_
 112     url = '{0}/{1}/'.format(constant.DETAIL_URL, id_)
 113
 114     try:
 115         response = request('get', url)
 116         if response.status_code in (200, ):
 117             response = response.content
 118         else:
 119             logger.debug('Slow down and retry ({}) ...'.format(id_))
 120             time.sleep(1)
 121             return doujinshi_parser(str(id_))
 122
 123     except Exception as e:
 124         logger.critical(str(e))
 125         raise SystemExit
 126
 127     html = BeautifulSoup(response, 'html.parser')
 128     doujinshi_info = html.find('div', attrs={'id': 'info'})
 129
 130     title = doujinshi_info.find('h1').text
 131     subtitle = doujinshi_info.find('h2')
 132
 133     doujinshi['name'] = title
 134     doujinshi['subtitle'] = subtitle.text if subtitle else ''
 135
 136     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
 137     img_id = re.search('/galleries/([\d]+)/cover\.(jpg|png)$', doujinshi_cover.a.img.attrs['data-src'])
 138
 139     ext = []
 140     for i in html.find_all('div', attrs={'class': 'thumb-container'}):
 141         _, ext_name = os.path.basename(i.img.attrs['data-src']).rsplit('.', 1)
 142         ext.append(ext_name)
 143
 144     if not img_id:
 145         logger.critical('Tried yo get image id failed')
 146         exit(1)
 147
 148     doujinshi['img_id'] = img_id.group(1)
 149     doujinshi['ext'] = ext
 150
 151     pages = 0
 152     for _ in doujinshi_info.find_all('div', class_=''):
 153         pages = re.search('([\d]+) pages', _.text)
 154         if pages:
 155             pages = pages.group(1)
 156             break
 157     doujinshi['pages'] = int(pages)
 158
 159     # gain information of the doujinshi
 160     information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
 161     needed_fields = ['Characters', 'Artists', 'Languages', 'Tags']
 162     for field in information_fields:
 163         field_name = field.contents[0].strip().strip(':')
 164         if field_name in needed_fields:
 165             data = [sub_field.contents[0].strip() for sub_field in
 166                     field.find_all('a', attrs={'class': 'tag'})]
 167             doujinshi[field_name.lower()] = ', '.join(data)
 168
 169     return doujinshi
 170
 171
 172 def search_parser(keyword, sorting, page):
 173     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
 174     try:
 175         response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
 176     except requests.ConnectionError as e:
 177         logger.critical(e)
 178         logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
 179         raise SystemExit
 180
 181     result = _get_title_and_id(response)
 182     if not result:
 183         logger.warn('Not found anything of keyword {}'.format(keyword))
 184
 185     return result
 186
 187
 188 def print_doujinshi(doujinshi_list):
 189     if not doujinshi_list:
 190         return
 191     doujinshi_list = [(i['id'], i['title']) for i in doujinshi_list]
 192     headers = ['id', 'doujinshi']
 193     logger.info('Search Result\n' +
 194                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 195
 196
 197 def tag_parser(tag_name, sorting, max_page=1):
 198     result = []
 199     tag_name = tag_name.lower()
 200     tag_name = tag_name.replace(' ', '-')
 201
 202     if sorting == 'date':
 203         sorting = ''
 204
 205     for p in range(1, max_page + 1):
 206         logger.debug('Fetching page {0} for doujinshi with tag \'{1}\''.format(p, tag_name))
 207         response = request('get', url='%s/%s/%s?page=%d' % (constant.TAG_URL, tag_name, sorting, p)).content
 208
 209         result += _get_title_and_id(response)
 210         if not result:
 211             logger.error('Cannot find doujinshi id of tag \'{0}\''.format(tag_name))
 212             return
 213
 214     if not result:
 215         logger.warn('No results for tag \'{}\''.format(tag_name))
 216
 217     return result
 218
 219
 220 def __api_suspended_search_parser(keyword, sorting, page):
 221     logger.debug('Searching doujinshis using keywords {0}'.format(keyword))
 222     result = []
 223     i = 0
 224     while i < 5:
 225         try:
 226             response = request('get', url=constant.SEARCH_URL, params={'query': keyword, 'page': page, 'sort': sorting}).json()
 227         except Exception as e:
 228             i += 1
 229             if not i < 5:
 230                 logger.critical(str(e))
 231                 logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
 232                 exit(1)
 233             continue
 234         break
 235
 236     if 'result' not in response:
 237         raise Exception('No result in response')
 238
 239     for row in response['result']:
 240         title = row['title']['english']
 241         title = title[:85] + '..' if len(title) > 85 else title
 242         result.append({'id': row['id'], 'title': title})
 243
 244     if not result:
 245         logger.warn('No results for keywords {}'.format(keyword))
 246
 247     return result
 248
 249
 250 def __api_suspended_tag_parser(tag_id, sorting, max_page=1):
 251     logger.info('Searching for doujinshi with tag id {0}'.format(tag_id))
 252     result = []
 253     response = request('get', url=constant.TAG_API_URL, params={'sort': sorting, 'tag_id': tag_id}).json()
 254     page = max_page if max_page <= response['num_pages'] else int(response['num_pages'])
 255
 256     for i in range(1, page + 1):
 257         logger.info('Getting page {} ...'.format(i))
 258
 259         if page != 1:
 260             response = request('get', url=constant.TAG_API_URL,
 261                                params={'sort': sorting, 'tag_id': tag_id}).json()
 262     for row in response['result']:
 263         title = row['title']['english']
 264         title = title[:85] + '..' if len(title) > 85 else title
 265         result.append({'id': row['id'], 'title': title})
 266
 267     if not result:
 268         logger.warn('No results for tag id {}'.format(tag_id))
 269
 270     return result
 271
 272
 273 def __api_suspended_doujinshi_parser(id_):
 274     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
 275         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 276
 277     id_ = int(id_)
 278     logger.log(15, 'Fetching information of doujinshi id {0}'.format(id_))
 279     doujinshi = dict()
 280     doujinshi['id'] = id_
 281     url = '{0}/{1}'.format(constant.DETAIL_URL, id_)
 282     i = 0
 283     while 5 > i:
 284         try:
 285             response = request('get', url).json()
 286         except Exception as e:
 287             i += 1
 288             if not i < 5:
 289                 logger.critical(str(e))
 290                 exit(1)
 291             continue
 292         break
 293
 294     doujinshi['name'] = response['title']['english']
 295     doujinshi['subtitle'] = response['title']['japanese']
 296     doujinshi['img_id'] = response['media_id']
 297     doujinshi['ext'] = ''.join(map(lambda s: s['t'], response['images']['pages']))
 298     doujinshi['pages'] = len(response['images']['pages'])
 299
 300     # gain information of the doujinshi
 301     needed_fields = ['character', 'artist', 'language', 'tag']
 302     for tag in response['tags']:
 303         tag_type = tag['type']
 304         if tag_type in needed_fields:
 305             if tag_type == 'tag':
 306                 if tag_type not in doujinshi:
 307                     doujinshi[tag_type] = {}
 308
 309                 tag['name'] = tag['name'].replace(' ', '-')
 310                 tag['name'] = tag['name'].lower()
 311                 doujinshi[tag_type][tag['name']] = tag['id']
 312             elif tag_type not in doujinshi:
 313                 doujinshi[tag_type] = tag['name']
 314             else:
 315                 doujinshi[tag_type] += ', ' + tag['name']
 316
 317     return doujinshi
 318
 319
 320 if __name__ == '__main__':
 321     print(doujinshi_parser("32271"))