nhentai/parser.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals, print_function
   3
   4 import os
   5 import re
   6 import time
   7 from bs4 import BeautifulSoup
   8 from tabulate import tabulate
   9
  10 import nhentai.constant as constant
  11 from nhentai.utils import request
  12 from nhentai.logger import logger
  13
  14
  15 def _get_csrf_token(content):
  16     html = BeautifulSoup(content, 'html.parser')
  17     csrf_token_elem = html.find('input', attrs={'name': 'csrfmiddlewaretoken'})
  18     if not csrf_token_elem:
  19         raise Exception('Cannot find csrf token to login')
  20     return csrf_token_elem.attrs['value']
  21
  22
  23 def login(username, password):
  24     logger.warning('This feature is deprecated, please use --cookie to set your cookie.')
  25     csrf_token = _get_csrf_token(request('get', url=constant.LOGIN_URL).text)
  26     if os.getenv('DEBUG'):
  27         logger.info('Getting CSRF token ...')
  28
  29     if os.getenv('DEBUG'):
  30         logger.info('CSRF token is {}'.format(csrf_token))
  31
  32     login_dict = {
  33         'csrfmiddlewaretoken': csrf_token,
  34         'username_or_email': username,
  35         'password': password,
  36     }
  37     resp = request('post', url=constant.LOGIN_URL, data=login_dict)
  38
  39     if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
  40         csrf_token = _get_csrf_token(resp.text)
  41         resp = request('post', url=resp.url, data={'csrfmiddlewaretoken': csrf_token, 'next': '/'})
  42
  43     if 'Invalid username/email or password' in resp.text:
  44         logger.error('Login failed, please check your username and password')
  45         exit(1)
  46
  47     if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
  48         logger.error('Using nhentai --cookie \'YOUR_COOKIE_HERE\' to save your Cookie.')
  49         exit(2)
  50
  51
  52 def _get_title_and_id(response):
  53     result = []
  54     html = BeautifulSoup(response, 'html.parser')
  55     doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
  56     for doujinshi in doujinshi_search_result:
  57         doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
  58         title = doujinshi_container.text.strip()
  59         title = title if len(title) < 85 else title[:82] + '...'
  60         id_ = re.search('/g/(\d+)/', doujinshi.a['href']).group(1)
  61         result.append({'id': id_, 'title': title})
  62
  63     return result
  64
  65
  66 def favorites_parser(page=None):
  67     result = []
  68     html = BeautifulSoup(request('get', constant.FAV_URL).content, 'html.parser')
  69     count = html.find('span', attrs={'class': 'count'})
  70     if not count:
  71         logger.error("Can't get your number of favorited doujins. Did the login failed?")
  72         return []
  73
  74     count = int(count.text.strip('(').strip(')').replace(',', ''))
  75     if count == 0:
  76         logger.warning('No favorites found')
  77         return []
  78     pages = int(count / 25)
  79
  80     if page:
  81         page_range_list = page
  82     else:
  83         if pages:
  84             pages += 1 if count % (25 * pages) else 0
  85         else:
  86             pages = 1
  87
  88         logger.info('You have %d favorites in %d pages.' % (count, pages))
  89
  90         if os.getenv('DEBUG'):
  91             pages = 1
  92
  93         page_range_list = range(1, pages + 1)
  94
  95     for page in page_range_list:
  96         try:
  97             logger.info('Getting doujinshi ids of page %d' % page)
  98             resp = request('get', constant.FAV_URL + '?page=%d' % page).content
  99
 100             result.extend(_get_title_and_id(resp))
 101         except Exception as e:
 102             logger.error('Error: %s, continue', str(e))
 103
 104     return result
 105
 106
 107 def doujinshi_parser(id_):
 108     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
 109         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 110
 111     id_ = int(id_)
 112     logger.log(15, 'Fetching doujinshi information of id {0}'.format(id_))
 113     doujinshi = dict()
 114     doujinshi['id'] = id_
 115     url = '{0}/{1}/'.format(constant.DETAIL_URL, id_)
 116
 117     try:
 118         response = request('get', url)
 119         if response.status_code in (200, ):
 120             response = response.content
 121         elif response.status_code in (404,):
 122             logger.error("Doujinshi with id {0} cannot be found".format(id_))
 123             return []
 124         else:
 125             logger.debug('Slow down and retry ({}) ...'.format(id_))
 126             time.sleep(1)
 127             return doujinshi_parser(str(id_))
 128
 129     except Exception as e:
 130         logger.warn('Error: {}, ignored'.format(str(e)))
 131         return None
 132
 133     html = BeautifulSoup(response, 'html.parser')
 134     doujinshi_info = html.find('div', attrs={'id': 'info'})
 135
 136     title = doujinshi_info.find('h1').text
 137     subtitle = doujinshi_info.find('h2')
 138
 139     doujinshi['name'] = title
 140     doujinshi['subtitle'] = subtitle.text if subtitle else ''
 141
 142     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
 143     img_id = re.search('/galleries/([\d]+)/cover\.(jpg|png|gif)$', doujinshi_cover.a.img.attrs['data-src'])
 144
 145     ext = []
 146     for i in html.find_all('div', attrs={'class': 'thumb-container'}):
 147         _, ext_name = os.path.basename(i.img.attrs['data-src']).rsplit('.', 1)
 148         ext.append(ext_name)
 149
 150     if not img_id:
 151         logger.critical('Tried yo get image id failed')
 152         exit(1)
 153
 154     doujinshi['img_id'] = img_id.group(1)
 155     doujinshi['ext'] = ext
 156
 157     for _ in doujinshi_info.find_all('div', class_='tag-container field-name'):
 158         if re.search('Pages:', _.text):
 159             pages = _.find('span', class_='name').string
 160     doujinshi['pages'] = int(pages)
 161
 162     # gain information of the doujinshi
 163     information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
 164     needed_fields = ['Characters', 'Artists', 'Languages', 'Tags', 'Parodies', 'Groups', 'Categories']
 165     for field in information_fields:
 166         field_name = field.contents[0].strip().strip(':')
 167         if field_name in needed_fields:
 168             data = [sub_field.find('span', attrs={'class': 'name'}).contents[0].strip() for sub_field in
 169                     field.find_all('a', attrs={'class': 'tag'})]
 170             doujinshi[field_name.lower()] = ', '.join(data)
 171
 172     time_field = doujinshi_info.find('time')
 173     if time_field.has_attr('datetime'):
 174         doujinshi['date'] = time_field['datetime']
 175     return doujinshi
 176
 177
 178 def old_search_parser(keyword, sorting='date', page=1):
 179     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
 180     response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
 181
 182     result = _get_title_and_id(response)
 183     if not result:
 184         logger.warn('Not found anything of keyword {}'.format(keyword))
 185
 186     return result
 187
 188
 189 def print_doujinshi(doujinshi_list):
 190     if not doujinshi_list:
 191         return
 192     doujinshi_list = [(i['id'], i['title']) for i in doujinshi_list]
 193     headers = ['id', 'doujinshi']
 194     logger.info('Search Result || Found %i doujinshis \n' % doujinshi_list.__len__() +
 195                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 196
 197
 198 def search_parser(keyword, sorting, page, is_page_all=False):
 199     # keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
 200     result = []
 201     if not page:
 202         page = [1]
 203
 204     if is_page_all:
 205         url = request('get', url=constant.SEARCH_URL, params={'query': keyword}).url
 206         init_response = request('get', url.replace('%2B', '+')).json()
 207         page = range(1, init_response['num_pages']+1)
 208
 209     total = '/{0}'.format(page[-1]) if is_page_all else ''
 210     for p in page:
 211         i = 0
 212
 213         logger.info('Searching doujinshis using keywords "{0}" on page {1}{2}'.format(keyword, p, total))
 214         while i < 3:
 215             try:
 216                 url = request('get', url=constant.SEARCH_URL, params={'query': keyword,
 217                                                                       'page': p, 'sort': sorting}).url
 218                 response = request('get', url.replace('%2B', '+')).json()
 219             except Exception as e:
 220                 logger.critical(str(e))
 221
 222             break
 223
 224         if 'result' not in response:
 225             logger.warn('No result in response in page {}'.format(p))
 226             break
 227
 228         for row in response['result']:
 229             title = row['title']['english']
 230             title = title[:85] + '..' if len(title) > 85 else title
 231             result.append({'id': row['id'], 'title': title})
 232
 233         if not result:
 234             logger.warn('No results for keywords {}'.format(keyword))
 235
 236     return result
 237
 238
 239 def __api_suspended_doujinshi_parser(id_):
 240     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
 241         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 242
 243     id_ = int(id_)
 244     logger.log(15, 'Fetching information of doujinshi id {0}'.format(id_))
 245     doujinshi = dict()
 246     doujinshi['id'] = id_
 247     url = '{0}/{1}'.format(constant.DETAIL_URL, id_)
 248     i = 0
 249     while 5 > i:
 250         try:
 251             response = request('get', url).json()
 252         except Exception as e:
 253             i += 1
 254             if not i < 5:
 255                 logger.critical(str(e))
 256                 exit(1)
 257             continue
 258         break
 259
 260     doujinshi['name'] = response['title']['english']
 261     doujinshi['subtitle'] = response['title']['japanese']
 262     doujinshi['img_id'] = response['media_id']
 263     doujinshi['ext'] = ''.join([i['t'] for i in response['images']['pages']])
 264     doujinshi['pages'] = len(response['images']['pages'])
 265
 266     # gain information of the doujinshi
 267     needed_fields = ['character', 'artist', 'language', 'tag', 'parody', 'group', 'category']
 268     for tag in response['tags']:
 269         tag_type = tag['type']
 270         if tag_type in needed_fields:
 271             if tag_type == 'tag':
 272                 if tag_type not in doujinshi:
 273                     doujinshi[tag_type] = {}
 274
 275                 tag['name'] = tag['name'].replace(' ', '-')
 276                 tag['name'] = tag['name'].lower()
 277                 doujinshi[tag_type][tag['name']] = tag['id']
 278             elif tag_type not in doujinshi:
 279                 doujinshi[tag_type] = tag['name']
 280             else:
 281                 doujinshi[tag_type] += ', ' + tag['name']
 282
 283     return doujinshi
 284
 285
# Manual smoke test: when the module is run as a script, parse one
# hard-coded doujinshi id and print the resulting value.
if __name__ == '__main__':
    print(doujinshi_parser("32271"))