nhentai/parser.py

   1 # coding: utf-8
   2 from __future__ import print_function
   3 import sys
   4 import re
   5 import requests
   6 from bs4 import BeautifulSoup
   7 import constant
   8 from logger import logger
   9 from tabulate import tabulate
  10
  11
  12 def request(method, url, **kwargs):
  13     if not hasattr(requests, method):
  14         raise AttributeError('\'requests\' object has no attribute \'{}\''.format(method))
  15
  16     return requests.__dict__[method](url, proxies=constant.PROXY, **kwargs)
  17
  18
  19 def doujinshi_parser(id_):
  20     if not isinstance(id_, (int,)) and (isinstance(id_, (str,)) and not id_.isdigit()):
  21         raise Exception('Doujinshi id({}) is not valid'.format(id_))
  22
  23     id_ = int(id_)
  24     logger.log(15, 'Fetching doujinshi information of id {}'.format(id_))
  25     doujinshi = dict()
  26     doujinshi['id'] = id_
  27     url = '{}/{}/'.format(constant.DETAIL_URL, id_)
  28
  29     try:
  30         response = request('get', url).content
  31     except Exception as e:
  32         logger.critical(str(e))
  33         sys.exit()
  34
  35     html = BeautifulSoup(response)
  36     doujinshi_info = html.find('div', attrs={'id': 'info'})
  37
  38     title = doujinshi_info.find('h1').text
  39     subtitle = doujinshi_info.find('h2')
  40
  41     doujinshi['name'] = title
  42     doujinshi['subtitle'] = subtitle.text if subtitle else ''
  43
  44     doujinshi_cover = html.find('div', attrs={'id': 'cover'})
  45     img_id = re.search('/galleries/([\d]+)/cover\.(jpg|png)$', doujinshi_cover.a.img['src'])
  46     if not img_id:
  47         logger.critical('Tried yo get image id failed')
  48         sys.exit()
  49     doujinshi['img_id'] = img_id.group(1)
  50     doujinshi['ext'] = img_id.group(2)
  51
  52     pages = 0
  53     for _ in doujinshi_info.find_all('div', class_=''):
  54         pages = re.search('([\d]+) pages', _.text)
  55         if pages:
  56             pages = pages.group(1)
  57             break
  58     doujinshi['pages'] = int(pages)
  59
  60     # gain information of the doujinshi
  61     information_fields = doujinshi_info.find_all('div', attrs={'class': 'field-name'})
  62     needed_fields = ['Characters', 'Artists', 'Language', 'Tags']
  63     for field in information_fields:
  64         field_name = field.contents[0].strip().strip(':')
  65         if field_name in needed_fields:
  66             data = [sub_field.contents[0].strip() for sub_field in
  67                     field.find_all('a', attrs={'class': 'tag'})]
  68             doujinshi[field_name.lower()] = ', '.join(data)
  69
  70     return doujinshi
  71
  72
  73 def search_parser(keyword, page):
  74     logger.debug('Searching doujinshis of keyword {}'.format(keyword))
  75     result = []
  76     try:
  77         response = request('get', url=constant.SEARCH_URL, params={'q': keyword, 'page': page}).content
  78     except requests.ConnectionError as e:
  79         logger.critical(e)
  80         logger.warn('If you are in China, please configure the proxy to fu*k GFW.')
  81         raise SystemExit
  82
  83     html = BeautifulSoup(response)
  84     doujinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
  85     for doujinshi in doujinshi_search_result:
  86         doujinshi_container = doujinshi.find('div', attrs={'class': 'caption'})
  87         title = doujinshi_container.text.strip()
  88         title = (title[:85] + '..') if len(title) > 85 else title
  89         id_ = re.search('/g/(\d+)/', doujinshi.a['href']).group(1)
  90         result.append({'id': id_, 'title': title})
  91     return result
  92
  93
  94 def print_doujinshi(doujinshi_list):
  95     if not doujinshi_list:
  96         return
  97     doujinshi_list = [i.values() for i in doujinshi_list]
  98     headers = ['id', 'doujinshi']
  99     logger.info('Search Result\n' +
 100                 tabulate(tabular_data=doujinshi_list, headers=headers, tablefmt='rst'))
 101
 102 if __name__ == '__main__':
 103     print(doujinshi_parser("32271"))