I just finished a script for downloading manga from nhentai.net.
I made it as a way to learn Python a bit, so it's nothing great, and it probably has bugs.
For now it seems to work fine, though.
import re, sys, urllib.request, os, requests, shutil
from bs4 import BeautifulSoup
USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0"
def fillZeros(num, digs):
    """Return ``num`` as a decimal string left-padded with zeros.

    Args:
        num: The number to format (a non-negative page number in this script).
        digs: Minimum width of the result, in characters.

    Returns:
        ``str(num)`` padded on the left with ``'0'`` until it is at least
        ``digs`` characters long. Values that are already wide enough are
        returned unchanged.
    """
    # str.zfill handles 0 correctly ("000" for digs=3) and avoids counting
    # digits through repeated float division.
    return str(num).zfill(digs)
def clean(title):
    """Return ``title`` with the characters ``/ \\ * > <`` removed.

    The result is used as a directory name, so path separators and a few
    other characters that are awkward in file names are stripped.
    """
    # Raw string: the previous non-raw pattern relied on the invalid escape
    # sequences "\/" and "\*", which newer Python versions warn about.
    return re.sub(r'[/\\*><]', '', title)
def _fetch_page(url, dest):
    """Stream ``url`` into the file ``dest``; return True only on HTTP 200.

    Nothing is written when the status code is not 200. The response is
    always closed so probing requests do not leak connections.
    """
    with requests.get(url, stream=True,
                      headers={'User-agent': USER_AGENT}) as response:
        if response.status_code != 200:
            return False
        # Let urllib3 undo any Content-Encoding before the raw copy.
        response.raw.decode_content = True
        with open(dest, 'wb') as out:
            shutil.copyfileobj(response.raw, out)
    return True


def download(gallery, name, title):
    """Download every page image of a gallery into a newly created directory.

    Pages are requested as ``<gallery><n>.jpg`` for n = 1, 2, ...; when the
    ``.jpg`` request does not return HTTP 200 the same page is retried as
    ``.png``. Downloading stops at the first page for which neither works.

    Args:
        gallery: URL prefix to which the page number and extension are appended.
        name: Prefix of the saved files, which are named
            ``<name>_<zero-padded page>.<ext>``.
        title: Directory name. It is passed through ``clean``; if a path with
            that name already exists, ``_1``, ``_2``, ... is appended until an
            unused name is found.
    """
    title = clean(title)
    if os.path.exists(title):
        suffix = 1
        while os.path.exists(title + '_' + str(suffix)):
            suffix += 1
        title += '_' + str(suffix)
    os.makedirs(title)

    page = 1
    while True:
        for ext in ('.jpg', '.png'):
            dest = os.path.join(title, name + '_' + fillZeros(page, 3) + ext)
            if _fetch_page(gallery + str(page) + ext, dest):
                break
        else:
            # Neither extension exists for this page: the gallery is finished.
            break
        print('Getting page ' + str(page))
        page += 1
    print(title + ' done: ' + str(page - 1) + ' pages')
def getPages(url):
    """Fetch a gallery page, work out its image URL prefix, and download it.

    The title is read from the ``<h1>`` inside ``<div id="info">``. The image
    URL prefix and gallery number are taken from the first ``src`` attribute
    in ``<div id="cover">`` that contains ``galleries/<number>/``. The
    thumbnail host ``t.nhentai`` is swapped for the image host ``i.nhentai``
    before calling ``download``.

    Args:
        url: Address of the gallery page.

    Raises:
        ValueError: If the page does not contain the expected title or cover
            markup.
    """
    print('Getting form '+url)
    request = urllib.request.Request(url)
    request.add_header('User-Agent', USER_AGENT)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode()
    soup = BeautifulSoup(html, 'html.parser')

    info = soup.find('div', id="info")
    heading = info.find('h1') if info is not None else None
    if heading is None:
        raise ValueError('Could not find the gallery title on ' + url)
    # get_text() also works when the <h1> contains nested tags, where
    # .string would be None.
    title = heading.get_text()

    cover = soup.find('div', id="cover")
    m = re.search('src="(.*galleries/([0-9]*)/)', str(cover))
    if m is None:
        raise ValueError('Could not find the cover image on ' + url)
    gallery = m.group(1)
    # The original code assumed a protocol-relative src ("//host/...");
    # only add a scheme in that case so absolute URLs are left intact.
    if gallery.startswith('//'):
        gallery = 'http:' + gallery
    gallery = gallery.replace('t.nhentai', 'i.nhentai')
    num = m.group(2)
    download(gallery, num, title)
# Script entry: take gallery URLs from the command line, or prompt for a
# whitespace-separated list when none were given.
if len(sys.argv) < 2:
    # input() is the Python 3 spelling of raw_input(); str.split() with no
    # argument also skips empty entries caused by repeated whitespace.
    urls = input('url: ').split()
else:
    urls = sys.argv[1:]
for url in urls:
    getPages(url)
Usage:
python scriptname.py mangaurl [mangaurl2 ...]
If someone wants to use it but it doesn't work, let me know and I'll try to fix it.
Edit: Fixed a few things.