audible-dl
An archiving tool for Audible audiobook libraries
audible-dl/account.go
package main

import (
	"bytes"
	"encoding/json"
	"errors"
	"fmt"
	"golang.org/x/net/html"
	"io"
	"io/ioutil"
	"log"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"os"
	"os/exec"
	"path/filepath"
	"regexp"
	"strconv"
	"strings"
)

////////////////////////////////////////////////////////////////////////
//                                  _            _     _           _
//   __ _  ___ ___ ___  _   _ _ __ | |_     ___ | |__ (_) ___  ___| |_
//  / _` |/ __/ __/ _ \| | | | '_ \| __|   / _ \| '_ \| |/ _ \/ __| __|
// | (_| | (_| (_| (_) | |_| | | | | |_   | (_) | |_) | |  __/ (__| |_
//  \__,_|\___\___\___/ \__,_|_| |_|\__|   \___/|_.__// |\___|\___|\__|
//                                                  |__/
////////////////////////////////////////////////////////////////////////

// The client has a slice of these, each of which is unmarshaled from
// the list of accounts in the .yml config file.
type Account struct {
	Name   string
	Bytes  string
	Auth   []*http.Cookie
	Scrape bool
	LogBuf bytes.Buffer
}

// Return a string representation of the account for debugging
// purposes.
func (a *Account) String() string {
	ret := "Account:\n"
	ret += " Name: " + a.Name + "\n"
	ret += " Bytes: " + a.Bytes + "\n"
	ret += " Scrape: " + strconv.FormatBool(a.Scrape) + "\n"
	ret += " Auth:\n"
	for _, c := range a.Auth {
		ret += " " + c.Name + ": " + c.Value + "\n"
	}
	return ret
}

// Flush the contents of a.LogBuf to stderr in order to make it easier
// to debug the scraper's progress.
func (a *Account) PrintScraperDebuggingInfo() {
	fmt.Fprintln(os.Stderr, a.LogBuf.String())
}

// Log the scraper's debugging info to the internal buffer to be
// printed by the above method.
func (a *Account) Log(str string, args ...any) {
	line := a.Name + ": " + fmt.Sprintf(str, args...) + "\n"
	a.LogBuf.WriteString(line)
	if logFile != nil {
		_, err := logFile.WriteString(line)
		unwrap(err)
	}
}

// Parse the contents of a .har archive passed in RAW into a slice of
// cookies that can be passed to an HTTP GET request.
func (a *Account) ImportCookiesFromHAR(raw []byte) {
	var har map[string]interface{}
	unwrap(json.Unmarshal(raw, &har))
	cookies := har["log"].(map[string]interface{})["entries"].([]interface{})[0].(map[string]interface{})["request"].(map[string]interface{})["cookies"].([]interface{})
	for _, c := range cookies {
		value := c.(map[string]interface{})["value"].(string)
		// The values of some non-essential cookies contain a double
		// quote character which net/http really doesn't like
		if strings.Contains(value, "\"") {
			continue
		}
		a.Auth = append(a.Auth, &http.Cookie{
			Name:  c.(map[string]interface{})["name"].(string),
			Value: value,
		})
	}
}
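
// Usage sketch (hypothetical): importing cookies from a HAR file on
// disk. The "cookies.har" path is a placeholder, and unwrap is the
// package's existing error helper.
func exampleImportCookiesFromHAR(a *Account) {
	raw, err := ioutil.ReadFile("cookies.har") // placeholder path
	unwrap(err)
	a.ImportCookiesFromHAR(raw)
	fmt.Fprint(os.Stderr, a.String()) // dump the parsed cookies for inspection
}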

// Shell out to ffmpeg in order to convert the .aax file in IN to the
// .m4b file in OUT using this account's activation bytes. On error,
// return ffmpeg's output.
func (a *Account) Convert(in, out string, client *Client) (error, []byte) {
	tmp := client.TempDir + filepath.Base(out)
	cmd := exec.Command("ffmpeg", "-activation_bytes", a.Bytes, "-i", in, "-c", "copy", tmp)
	cmd.Stdout = nil
	stderr, _ := cmd.StderrPipe()
	err := cmd.Start()
	slurp, _ := io.ReadAll(stderr)
	if err != nil {
		return err, slurp
	}
	if err = cmd.Wait(); err != nil {
		return err, slurp
	}
	if err = os.Rename(tmp, out); err != nil {
		return err, nil
	}
	return nil, nil
}

// Download an HTML page of the user's library.
func (a *Account) getLibraryPage(page int) ([]byte, error) {
	jar, _ := cookiejar.New(nil)
	client := &http.Client{Jar: jar}
	uri := "https://www.audible.com/library/titles?page=" + strconv.Itoa(page)
	req, _ := http.NewRequest("GET", uri, nil)
	jaruri, _ := url.ParseRequestURI(uri)
	jar.SetCookies(jaruri, a.Auth)
	a.Log("Fetching library page %d", page)
	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	if resp.StatusCode != 200 {
		return nil, errors.New("getLibraryPage: " + resp.Status)
	}
	html, _ := ioutil.ReadAll(resp.Body)
	return html, nil
}

// Fill in the structure for a single book.
func (a *Account) xSingleBook(dom *html.Tokenizer, tt html.TokenType, tok html.Token) Book {
	var book Book

	// First we'll extract the book's slug from its div's id
	slug := id(tok)
	book.Slug = slug[len(slug)-10:]

	a.Log("Extracting a single book...")

	for {
		tt = dom.Next()
		tok = dom.Token()
		if strings.Contains(class(tok), "bc-image-inset-border") {
			for _, a := range tok.Attr {
				if a.Key == "src" {
					book.CoverURL = cleanstr(a.Val)
				}
			}
			a.Log("Found cover image URL: %s", book.CoverURL)
		} else if class(tok) == "bc-text bc-size-headline3" {
			tt = dom.Next()
			tok = dom.Token()
			book.Title = cleanstr(tok.Data)
			a.Log("Found book title: %s", book.Title)
			continue
		} else if strings.Contains(class(tok), "authorLabel") {
			book.Authors = xPeople(dom)
			a.Log("Found book author(s): %s", book.Authors)
			continue
		} else if strings.Contains(class(tok), "narratorLabel") {
			book.Narrators = xPeople(dom)
			a.Log("Found book narrator(s): %s", book.Narrators)
			continue
		} else if strings.Contains(class(tok), "merchandisingSummary") {
			book.Summary = xSummary(dom, tt, tok)
			a.Log("Found book summary: %s...", book.Summary[:10])
			continue
		} else if id(tok) == "time-remaining-display-"+book.Slug {
			for !(tt == html.EndTagToken && tok.Data == "span") {
				tt = dom.Next()
				tok = dom.Token()
				if tt == html.TextToken {
					book.Runtime = cleanstr(tok.Data)
				}
			}
			a.Log("Found book runtime: %s", book.Runtime)
			continue
		} else if strings.Contains(href(tok), "/series/") {
			book.Series, book.SeriesIndex = xSeries(dom, tt, tok)
			a.Log("Found book series and index: %s, %d", book.Series, book.SeriesIndex)
			continue
		} else if href(tok) == "/companion-file/"+book.Slug {
			book.CompanionURL = "https://audible.com" + cleanstr(href(tok))
			a.Log("Found book companion URL: %s", book.CompanionURL)
			continue
		}
		// We've arrived at the next book
		if strings.Contains(class(tok), "library-item-divider") ||
			id(tok) == "adbl-library-content-toast-messaging" {
			a.Log("Breaking to next book")
			break
		}
	}

	book.DownloadURL = "https://www.audible.com/library/download?asin=" + book.Slug + "&codec=AAX"
	book.FileName = stripstr(book.Title)

	return book
}
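
// Usage sketch (hypothetical): consuming Convert's (error, output)
// return pair. The input and output paths are placeholders; on failure
// the second value holds ffmpeg's stderr, which is worth logging.
func exampleConvertUsage(a *Account, client *Client) {
	in := client.TempDir + "somebook.aax" // placeholder input path
	out := "somebook.m4b"                 // placeholder output path
	if err, ffmpeg := a.Convert(in, out, client); err != nil {
		log.Printf("conversion failed: %v\n%s", err, ffmpeg)
	}
}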

// Scrape the library until we encounter a book whose filename
// (display title run through stripstr) matches lim, returning a slice
// of books. If lim is an empty string this behaves exactly like
// ScrapeFullLibrary().
func (a *Account) ScrapeLibraryUntil(pagenum chan int, lim string) ([]Book, error) {
	var books []Book
	defer close(pagenum)

	// audible.com/library/titles?page=N doesn't return a 404 when
	// we access pages that don't exist, so we'll store the slug of
	// the first books of the current and previous pages in these
	// respective variables and check against the current book's slug.
	var firstincurrpage string = ""
	var firstinprevpage string = ""

	for i := 1; ; i++ {
		pagenum <- i
		raw, err := a.getLibraryPage(i)
		if err != nil {
			return nil, err
		}
		a.Log("Tokenizing page %d", i)
		dom := html.NewTokenizer(bytes.NewReader(raw))
		for {
			tt := dom.Next()
			tok := dom.Token()
			if tokBeginsBook(tt, tok) {
				a.Log("Found a book row")
				// If we find a book, extract it
				book := a.xSingleBook(dom, tt, tok)
				if book.Slug == firstinprevpage {
					a.Log("Reached a duplicate page")
					return books, nil
				}
				if book.FileName == lim && lim != "" {
					a.Log("Reached the final book")
					return books, nil
				}
				books = append(books, book)
				if firstincurrpage == "" {
					// Save the first book in the page
					firstincurrpage = book.Slug
				}
				if firstinprevpage == "" {
					// This is the first page
					firstinprevpage = book.Slug
				}
				continue
			}
			// Exit the inner loop when we reach the end
			if id(tok) == "center-6" || tt == html.ErrorToken {
				break
			}
		}
		// We're fetching the next page, so we cycle these out
		firstinprevpage = firstincurrpage
		firstincurrpage = ""

		// If we didn't extract any books from the first page then
		// we probably won't extract any from the next, so we should
		// just break and return an error, saving the page source in
		// a file along with debugging information
		if len(books) == 0 {
			ioutil.WriteFile(".audible-dl-debug.html", raw, 0644)
			return nil, errors.New("Failed to extract books from HTML")
		}
	}
}

// Return a slice of all the books in the user's library.
func (a *Account) ScrapeFullLibrary(pagenum chan int) ([]Book, error) {
	return a.ScrapeLibraryUntil(pagenum, "")
}
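
// Usage sketch (hypothetical): driving the scraper while reporting page
// progress. ScrapeLibraryUntil sends each page number on pagenum and
// closes the channel when it returns, so the consumer goroutine below
// exits on its own. The function name and output format are
// illustrative only.
func exampleScrapeProgress(a *Account) {
	pagenum := make(chan int)
	go func() {
		for p := range pagenum {
			fmt.Fprintf(os.Stderr, "scraping page %d\n", p)
		}
	}()
	books, err := a.ScrapeFullLibrary(pagenum)
	if err != nil {
		a.PrintScraperDebuggingInfo()
		log.Println(err)
		return
	}
	fmt.Printf("scraped %d books\n", len(books))
}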

// Download a single .aax file from Audible's website using the URL
// discovered by the scraper. The file is downloaded to the temp
// directory, using an intermediate .part file while the download is in
// progress. The path to the .aax is returned in order to be passed to
// the converter.
func (a *Account) DownloadSingleBook(client *Client, book Book) string {
	aax := client.TempDir + book.FileName + ".aax"
	out, err := os.Create(aax + ".part")
	unwrap(err)

	jar, _ := cookiejar.New(nil)
	httpcl := &http.Client{Jar: jar}
	req, _ := http.NewRequest("GET", book.DownloadURL, nil)
	jaruri, _ := url.ParseRequestURI(book.DownloadURL)
	jar.SetCookies(jaruri, a.Auth)

	resp, err := httpcl.Do(req)
	unwrap(err)
	if resp.StatusCode != http.StatusOK {
		log.Fatal("Request returned " + resp.Status)
	}

	nbytes, err := io.Copy(out, resp.Body)
	unwrap(err)
	if nbytes != resp.ContentLength {
		log.Fatal("Failed to write file to disk")
	}

	unwrap(os.Rename(aax+".part", aax))
	return aax
}

////////////////////////////////////////////////////////////////////////
//                                               _   _ _ _ _
//   ___  ___ _ __ __ _ _ __   ___ _ __    _   _| |_(_) (_) |_ ___  ___
//  / __|/ __| '__/ _` | '_ \ / _ \ '__|  | | | | __| | | | __/ _ \/ __|
//  \__ \ (__| | | (_| | |_) |  __/ |     | |_| | |_| | | | ||  __/\__ \
//  |___/\___|_|  \__,_| .__/ \___|_|      \__,_|\__|_|_|_|\__\___||___/
//                     |_|
////////////////////////////////////////////////////////////////////////

// Determine if the current html token contains a book
func tokBeginsBook(tt html.TokenType, tok html.Token) bool {
	return tt == html.StartTagToken &&
		class(tok) == "adbl-library-content-row"
}

// Remove whitespace and other shell-reserved characters from S
func stripstr(s string) string {
	r := regexp.MustCompile(
		`\s|\\|\(|\)|\[|\]|\{|\}|\*|\?|\!|\+|\,|\;|\&|\||\'|\"|‘`)
	s = r.ReplaceAllString(s, "")
	return s
}

// Remove extra whitespace and newlines from a string
func cleanstr(s string) string {
	r := regexp.MustCompile(`\s+`)
	s = strings.TrimSpace(s)
	s = strings.Trim(s, "\n")
	s = r.ReplaceAllString(s, " ")
	return s
}

// Get the class attribute of html tag T
func class(t html.Token) string {
	for _, a := range t.Attr {
		if a.Key == "class" {
			return cleanstr(a.Val)
		}
	}
	return ""
}

// Get the href attribute of html tag T
func href(t html.Token) string {
	for _, a := range t.Attr {
		if a.Key == "href" {
			return cleanstr(a.Val)
		}
	}
	return ""
}

// Get the id attribute of html tag T
func id(t html.Token) string {
	for _, a := range t.Attr {
		if a.Key == "id" {
			return cleanstr(a.Val)
		}
	}
	return ""
}

// Get the aria-label (how is this different from id??) attribute of html tag T
func aria_label(t html.Token) string {
	for _, a := range t.Attr {
		if a.Key == "aria-label" {
			return cleanstr(a.Val)
		}
	}
	return ""
}
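
// Usage sketch (hypothetical): how a scraped display title is
// normalized by cleanstr and then reduced to a filename-safe string by
// stripstr. The title below is a placeholder.
func exampleStringHelpers() {
	title := cleanstr("  The  Hitchhiker's\n Guide  ") // "The Hitchhiker's Guide"
	name := stripstr(title)                            // "TheHitchhikersGuide"
	fmt.Println(title, name)
}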

// Get the book's author and narrator (translator?)
func xPeople(dom *html.Tokenizer) []string {
	var people []string
	var prevtag string
	var pprevtag string
	tt := dom.Next()
	tok := dom.Token()
	for tok.Data != "li" {
		tt = dom.Next()
		tok = dom.Token()
		if tt == html.StartTagToken {
			pprevtag = prevtag
			prevtag = tok.Data
		}
		if tt == html.EndTagToken {
			pprevtag = prevtag
			prevtag = ""
		}
		if pprevtag == "a" && prevtag == "span" && tt == html.TextToken {
			people = append(people, cleanstr(tok.Data))
		}
	}
	return people
}

// Get the book's summary
func xSummary(dom *html.Tokenizer, tt html.TokenType, tok html.Token) string {
	var s string
	for !(tt == html.EndTagToken && tok.Data == "span") {
		tt = dom.Next()
		tok = dom.Token()
		if tt == html.TextToken {
			s += " " + tok.Data + " "
			s = cleanstr(s)
		}
	}
	return s
}

// Get the book's series name and index number
func xSeries(dom *html.Tokenizer, tt html.TokenType, tok html.Token) (string, int) {
	var series string
	var index int
	for !(tt == html.EndTagToken && tok.Data == "span") {
		tt = dom.Next()
		tok = dom.Token()
		if tt == html.TextToken {
			if series == "" {
				series = cleanstr(tok.Data)
			} else if index == 0 {
				s := cleanstr(tok.Data)
				if strings.Contains(s, ", Book") {
					index, _ = strconv.Atoi(s[len(s)-1:])
				}
				break
			}
		}
	}
	return series, index
}
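
// Usage sketch (hypothetical): xSeries reads the series index from the
// final character of a label like "Some Series, Book 3", so only
// single-digit indices are picked up as written. The label text here is
// a placeholder.
func exampleSeriesIndex() {
	s := cleanstr("Some Series, Book 3")
	if strings.Contains(s, ", Book") {
		index, _ := strconv.Atoi(s[len(s)-1:])
		fmt.Println(index) // prints 3
	}
}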