nhentai/parser.py

   1 from __future__ import print_function
   2 import sys
   3 import re
   4 import requests
   5 from bs4 import BeautifulSoup
   6 from constant import DETAIL_URL, SEARCH_URL
   7 from logger import logger
   8 from tabulate import tabulate
   9
  10
  11 def dojinshi_parser(id):
  12     if not isinstance(id, (int, )) and (isinstance(id, (str, )) and not id.isdigit()):
  13         raise Exception('Dojinshi id(%s) is not valid' % str(id))
  14     id = int(id)
  15     logger.debug('Fetching dojinshi information of id %d' % id)
  16     dojinshi = dict()
  17     dojinshi['id'] = id
  18     url = '%s/%d/' % (DETAIL_URL, id)
  19
  20     try:
  21         response = requests.get(url).content
  22     except Exception as e:
  23         logger.critical('%s%s' % tuple(e.message))
  24         sys.exit()
  25
  26     html = BeautifulSoup(response)
  27     dojinshi_info = html.find('div', attrs={'id': 'info'})
  28
  29     title = dojinshi_info.find('h1').text
  30     subtitle = dojinshi_info.find('h2')
  31
  32     dojinshi['name'] = title
  33     dojinshi['subtitle'] = subtitle.text if subtitle else ''
  34
  35     dojinshi_cover = html.find('div', attrs={'id': 'cover'})
  36     img_id = re.search('/galleries/([\d]+)/cover\.(jpg|png)$', dojinshi_cover.a.img['src'])
  37     if not img_id:
  38         logger.critical('Tried yo get image id failed')
  39         sys.exit()
  40     dojinshi['img_id'] = img_id.group(1)
  41     dojinshi['ext'] = img_id.group(2)
  42
  43     pages = 0
  44     for _ in dojinshi_info.find_all('div', class_=''):
  45         pages = re.search('([\d]+) pages', _.text)
  46         if pages:
  47             pages = pages.group(1)
  48             break
  49     dojinshi['pages'] = int(pages)
  50     return dojinshi
  51
  52
  53 def search_parser(keyword, page):
  54     logger.debug('Searching dojinshis of keyword %s' % keyword)
  55     result = []
  56     response = requests.get(SEARCH_URL, params={'q': keyword, 'page': page}).content
  57     html = BeautifulSoup(response)
  58     dojinshi_search_result = html.find_all('div', attrs={'class': 'gallery'})
  59     for dojinshi in dojinshi_search_result:
  60         dojinshi_container = dojinshi.find('div', attrs={'class': 'caption'})
  61         title = dojinshi_container.text.strip()
  62         title = (title[:85] + '..') if len(title) > 85 else title
  63         id_ = re.search('/g/(\d+)/', dojinshi.a['href']).group(1)
  64         result.append({'id': id_, 'title': title})
  65     return result
  66
  67
  68 def print_dojinshi(dojinshi_list):
  69     if not dojinshi_list:
  70         return
  71     dojinshi_list = [i.values() for i in dojinshi_list]
  72     headers = ['id', 'dojinshi']
  73     logger.info('Search Result\n' +
  74                 tabulate(tabular_data=dojinshi_list, headers=headers, tablefmt='rst'))
  75
  76 if __name__ == '__main__':
  77     print(dojinshi_parser("32271"))