nhentai/parser.py

   1 # coding: utf-8
   2
   3 import os
   4 import re
   5 import time
   6 from bs4 import BeautifulSoup
   7 from tabulate import tabulate
   8
   9 import nhentai.constant as constant
  10 from nhentai.utils import request
  11 from nhentai.logger import logger
  12
  13
  14 def _get_csrf_token(content):
  15     html = BeautifulSoup(content, 'html.parser')
  16     csrf_token_elem = html.find('input', attrs={'name': 'csrfmiddlewaretoken'})
  17     if not csrf_token_elem:
  18         raise Exception('Cannot find csrf token to login')
  19     return csrf_token_elem.attrs['value']
  20
  21
  22 def login(username, password):
  23     logger.warning('This feature is deprecated, please use --cookie to set your cookie.')
  24     csrf_token = _get_csrf_token(request('get', url=constant.LOGIN_URL).text)
  25     if os.getenv('DEBUG'):
  26         logger.info('Getting CSRF token ...')
  27
  28     if os.getenv('DEBUG'):
  29         logger.info('CSRF token is {}'.format(csrf_token))
  30
  31     login_dict = {
  32         'csrfmiddlewaretoken': csrf_token,
  33         'username_or_email': username,
  34         'password': password,
  35     }
  36     resp = request('post', url=constant.LOGIN_URL, data=login_dict)
  37
  38     if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
  39         csrf_token = _get_csrf_token(resp.text)
  40         resp = request('post', url=resp.url, data={'csrfmiddlewaretoken': csrf_token, 'next': '/'})
  41
  42     if 'Invalid username/email or password' in resp.text:
  43         logger.error('Login failed, please check your username and password')
  44         exit(1)
  45
  46     if 'You\'re loading pages way too quickly.' in resp.text or 'Really, slow down' in resp.text:
  47         logger.error('Using nhentai --cookie \'YOUR_COOKIE_HERE\' to save your Cookie.')
  48         exit(2)
  49
  50
  51 def _get_title_and_id(response):
  52     result = []
  53     html = BeautifulSoup(response, 'html.parser')
  54     doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
  55     for doujinshi in doujinshi_search_result:
  56         doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
  57         title = doujinshi_container.text.strip()
  58         title = title if len(title) < 85 else title[:82] + '...'
  59         id_ = re.search('/g/(\d+)/', doujinshi.a['href']).group(1)
  60         result.append({'id': id_, 'title': title})
  61
  62     return result
  63
  64
  65 def favorites_parser(page=None):
  66     result = []
  67     html = BeautifulSoup(request('get', constant.FAV_URL).content, 'html.parser')
  68     count = html.find('span', attrs={'class': 'count'})
  69     if not count:
  70         logger.error("Can't get your number of favorited doujins. Did the login failed?")
  71         return []
  72
  73     count = int(count.text.strip('(').strip(')').replace(',', ''))
  74     if count == 0:
  75         logger.warning('No favorites found')
  76         return []
  77     pages = int(count / 25)
  78
  79     if page:
  80         page_range_list = page
  81     else:
  82         if pages:
  83             pages += 1 if count % (25 * pages) else 0
  84         else:
  85             pages = 1
  86
  87         logger.info('You have %d favorites in %d pages.' % (count, pages))
  88
  89         if os.getenv('DEBUG'):
  90             pages = 1
  91
  92         page_range_list = range(1, pages + 1)
  93
  94     for page in page_range_list:
  95         try:
  96             logger.info('Getting doujinshi ids of page %d' % page)
  97             resp = request('get', constant.FAV_URL + '?page=%d' % page).content
  98
  99             result.extend(_get_title_and_id(resp))
 100         except Exception as e:
 101             logger.error('Error: %s, continue', str(e))
 102
 103     return result
 104
 105
 106 def doujinshi_parser(id_):
 107     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
 108         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 109
 110     id_ = int(id_)
 111     logger.log(15, 'Fetching doujinshi information of id {0}'.format(id_))
 112     doujinshi = dict()
 113     doujinshi['id'] = id_
 114     url = '{0}/{1}/'.format(constant.DETAIL_URL, id_)
 115
 116     try:
 117         response = request('get', url)
 118         if response.status_code in (200, ):
 119             response = response.content
 120         elif response.status_code in (404,):
 121             logger.error("Doujinshi with id {0} cannot be found".format(id_))
 122             return []
 123         else:
 124             logger.debug('Slow down and retry ({}) ...'.format(id_))
 125             time.sleep(1)
 126             return doujinshi_parser(str(id_))
 127
 128     except Exception as e:
 129         logger.warning('Error: {}, ignored'.format(str(e)))
 130         return None
 131
 132     html = BeautifulSoup(response, 'html.parser')
 133     doujinshi_info = html.find('div', attrs={'id': 'info'})
 134
 135     title = doujinshi_info.find('h1').text
 136     pretty_name = doujinshi_info.find('h1').find('span', attrs={'class': 'pretty'}).text
 137     subtitle = doujinshi_info.find('h2')
 138
 139     doujinshi['name'] = title
 140     doujinshi['pretty_name'] = pretty_name
 141     doujinshi['subtitle'] = subtitle.text if subtitle else ''
 142
 143     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
 144     img_id = re.search('/galleries/([0-9]+)/cover.(jpg|png|gif)$',
 145                        doujinshi_cover.a.img.attrs['data-src'])
 146
 147     ext = []
 148     for i in html.find_all('div', attrs={'class': 'thumb-container'}):
 149         _, ext_name = os.path.basename(i.img.attrs['data-src']).rsplit('.', 1)
 150         ext.append(ext_name)
 151
 152     if not img_id:
 153         logger.critical('Tried yo get image id failed')
 154         exit(1)
 155
 156     doujinshi['img_id'] = img_id.group(1)
 157     doujinshi['ext'] = ext
 158
 159     for _ in doujinshi_info.find_all('div', class_='tag-container field-name'):
 160         if re.search('Pages:', _.text):
 161             pages = _.find('span', class_='name').string
 162     doujinshi['pages'] = int(pages)
 163
 164     # gain information of the doujinshi
 165     information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
 166     needed_fields = ['Characters', 'Artists', 'Languages', 'Tags', 'Parodies', 'Groups', 'Categories']
 167     for field in information_fields:
 168         field_name = field.contents[0].strip().strip(':')
 169         if field_name in needed_fields:
 170             data = [sub_field.find('span', attrs={'class': 'name'}).contents[0].strip() for sub_field in
 171                     field.find_all('a', attrs={'class': 'tag'})]
 172             doujinshi[field_name.lower()] = ', '.join(data)
 173
 174     time_field = doujinshi_info.find('time')
 175     if time_field.has_attr('datetime'):
 176         doujinshi['date'] = time_field['datetime']
 177     return doujinshi
 178
 179
 180 def old_search_parser(keyword, sorting='date', page=1):
 181     logger.debug('Searching doujinshis of keyword {0}'.format(keyword))
 182     response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page, 'sort': sorting}).content
 183
 184     result = _get_title_and_id(response)
 185     if not result:
 186         logger.warning('Not found anything of keyword {}'.format(keyword))
 187
 188     return result
 189
 190
 191 def print_doujinshi(doujinshi_list):
 192     if not doujinshi_list:
 193         return
 194     doujinshi_list = [(i['id'], i['title']) for i in doujinshi_list]
 195     headers = ['id', 'doujinshi']
 196     logger.info('Search Result || Found %i doujinshis \n' % doujinshi_list.__len__() +
 197                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 198
 199
 200 def search_parser(keyword, sorting, page, is_page_all=False):
 201     # keyword = '+'.join([i.strip().replace(' ', '-').lower() for i in keyword.split(',')])
 202     result = []
 203     response = None
 204     if not page:
 205         page = [1]
 206
 207     if is_page_all:
 208         url = request('get', url=constant.SEARCH_URL, params={'query': keyword}).url
 209         init_response = request('get', url.replace('%2B', '+')).json()
 210         page = range(1, init_response['num_pages']+1)
 211
 212     total = '/{0}'.format(page[-1]) if is_page_all else ''
 213     for p in page:
 214         i = 0
 215
 216         logger.info('Searching doujinshis using keywords "{0}" on page {1}{2}'.format(keyword, p, total))
 217         while i < 3:
 218             try:
 219                 url = request('get', url=constant.SEARCH_URL, params={'query': keyword,
 220                                                                       'page': p, 'sort': sorting}).url
 221                 print(url)
 222                 response = request('get', url.replace('%2B', '+')).json()
 223             except Exception as e:
 224                 logger.critical(str(e))
 225                 response = None
 226             break
 227
 228         if response is None or 'result' not in response:
 229             logger.warning('No result in response in page {}'.format(p))
 230             break
 231
 232         for row in response['result']:
 233             title = row['title']['english']
 234             title = title[:85] + '..' if len(title) > 85 else title
 235             result.append({'id': row['id'], 'title': title})
 236
 237         if not result:
 238             logger.warning('No results for keywords {}'.format(keyword))
 239
 240     return result
 241
 242
 243 def __api_suspended_doujinshi_parser(id_):
 244     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
 245         raise Exception('Doujinshi id({0}) is not valid'.format(id_))
 246
 247     id_ = int(id_)
 248     logger.log(15, 'Fetching information of doujinshi id {0}'.format(id_))
 249     doujinshi = dict()
 250     doujinshi['id'] = id_
 251     url = '{0}/{1}'.format(constant.DETAIL_URL, id_)
 252     i = 0
 253     while 5 > i:
 254         try:
 255             response = request('get', url).json()
 256         except Exception as e:
 257             i += 1
 258             if not i < 5:
 259                 logger.critical(str(e))
 260                 exit(1)
 261             continue
 262         break
 263
 264     doujinshi['name'] = response['title']['english']
 265     doujinshi['subtitle'] = response['title']['japanese']
 266     doujinshi['img_id'] = response['media_id']
 267     doujinshi['ext'] = ''.join([i['t'] for i in response['images']['pages']])
 268     doujinshi['pages'] = len(response['images']['pages'])
 269
 270     # gain information of the doujinshi
 271     needed_fields = ['character', 'artist', 'language', 'tag', 'parody', 'group', 'category']
 272     for tag in response['tags']:
 273         tag_type = tag['type']
 274         if tag_type in needed_fields:
 275             if tag_type == 'tag':
 276                 if tag_type not in doujinshi:
 277                     doujinshi[tag_type] = {}
 278
 279                 tag['name'] = tag['name'].replace(' ', '-')
 280                 tag['name'] = tag['name'].lower()
 281                 doujinshi[tag_type][tag['name']] = tag['id']
 282             elif tag_type not in doujinshi:
 283                 doujinshi[tag_type] = tag['name']
 284             else:
 285                 doujinshi[tag_type] += ', ' + tag['name']
 286
 287     return doujinshi
 288
 289
 290 if __name__ == '__main__':
 291     print(doujinshi_parser("32271"))