#!/usr/bin/env runghc

{-# LANGUAGE OverloadedStrings #-}

-- dependencies: libghc-pandoc-dev

-- usage: 'link-extractor.hs [--print-filenames] [file]'; prints out a newline-delimited list of hyperlinks found in
-- the targeted Pandoc Markdown .md files (or simple Pandoc-readable HTML .html files) when parsed.
-- Local anchor links are rewritten assuming Gwern.net-style paths of Markdown .md files (ie. a link like
-- `[discriminator ranking](#discriminator-ranking)` in ~/wiki/face.md will be parsed to `/face#discriminator-ranking`).
-- Interwiki links are rewritten to their full URLs.
--
-- If no filename arguments are given, link-extractor will instead read stdin as Markdown and attempt to parse that
-- (falling back to HTML if no URLs are parsed).
-- This makes it easy to pipe in arbitrary sections of pages or annotations, such as
-- `$ xclip -o | runghc -i/home/gwern/wiki/static/build/ /home/gwern/wiki/static/build/link-extractor.hs`.
--
-- Hyperlinks are not necessarily to the WWW but can be internal or interwiki hyperlinks (eg.
-- '/local/file.pdf' or '!W').
-- This reads multiple files and processes them one by one, so it is not parallelized, but you can parallelize it at
-- the process level with eg. `parallel --max-args=500 --jobs 30` to use 30 cores, more or less.

module Main where

import Control.Monad (unless)
import Data.List (isSuffixOf)
import qualified Data.Text as T (append, head, null, pack, unlines)
import qualified Data.Text.IO as TIO (getContents, readFile, putStr, putStrLn)
import System.Directory (doesFileExist)
import System.Environment (getArgs)
import System.FilePath (takeBaseName)

import Query (extractLinks)

-- | Map over the filenames
main :: IO ()
main = do fs <- getArgs
          let printfilename = take 1 fs == ["--print-filenames"]
          let fs' = if printfilename then drop 1 fs else fs

          if null fs then
            -- no file arguments: read stdin, trying Markdown first & falling back to HTML if no URLs parse:
            do stdin <- TIO.getContents
               let links = extractLinks True stdin
               let links' = if not (null links) then links else extractLinks False stdin
               mapM_ TIO.putStrLn links'
            else mapM_ (printURLs printfilename) fs'

-- | Read 1 file and print out its URLs
printURLs :: Bool -> FilePath -> IO ()
printURLs printfilename file =
 do exists <- doesFileExist file
    unless exists $ error ("link-extractor.hs: specified file does not exist: " ++ file ++ " (print-filenames: " ++ show printfilename ++ ")")

    input <- TIO.readFile file
    let converted = extractLinks (".md" `isSuffixOf` file) input

    -- rewrite self-links like "#discriminator-ranking" → "/face#discriminator-ranking" by prefixing the original
    -- Markdown filename's absolute-ized base-name; this makes frequency counts more informative, eg. for deciding
    -- what sections to refactor out into standalone pages (because heavy cross-referencing *inside* a page is an
    -- important indicator of a section being 'too big', just like cross-page references are).
    -- (guard with `T.null` first: `T.head` is partial and would crash on an empty link target)
    let converted' = map (\u -> if T.null u || T.head u /= '#' then u
                                else "/" `T.append` T.pack (takeBaseName file) `T.append` u)
                         converted

    if printfilename then TIO.putStr $ T.unlines $ map (\url -> T.pack file `T.append` ":" `T.append` url) converted'
      else TIO.putStr $ T.unlines converted'
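
-- Example session (a sketch: `/tmp/demo.md` and its contents are hypothetical, and the exact output assumes
-- Query.extractLinks :: Bool -> T.Text -> [T.Text] returns raw link targets, including intra-page '#' anchors,
-- as the rewriting code above expects):
--
-- $ echo 'See [WP](https://en.wikipedia.org/) and [ranking](#discriminator-ranking).' > /tmp/demo.md
-- $ runghc -i/home/gwern/wiki/static/build/ link-extractor.hs /tmp/demo.md
-- https://en.wikipedia.org/
-- /demo#discriminator-ranking
--
-- With `--print-filenames`, each URL would instead be prefixed with its source file, eg.
-- `/tmp/demo.md:https://en.wikipedia.org/`, which is convenient for frequency counts across many files.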