/**
 * Utilities related to network
 * @author Kiminori Matsuzaki
 */

import java.io.*;
import java.util.*;
import java.net.*;
import java.util.regex.*;

/**
 * Plain data holder describing one analyzed HTML page.
 * Both fields start out as null and are filled in by whoever creates
 * the instance.
 */
class HTMLPageInfo {
  /**
   * Title text of the page.
   */
  String title;

  /**
   * URLs found on the page; every element is a String holding one URL.
   */
  Vector pageLists;
}

class NetworkUtil {
  /** Title returned when the page has no title element. */
  private static final String DEFAULT_TITLE = "Untitled Document";

  /**
   * Regex fragment matching "= value" of an HTML attribute. Exactly one of
   * its three groups is set on a match: group 1 for a double-quoted value,
   * group 2 for a single-quoted value, group 3 for an unquoted value.
   */
  private static final String ATTRIBUTE_VALUE =
      "\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))";

  /** Number of capturing groups in {@link #ATTRIBUTE_VALUE}. */
  private static final int ATTRIBUTE_VALUE_GROUPS = 3;

  /** Matches the charset parameter of a Content-Type header value. */
  private static final Pattern CHARSET_PATTERN = Pattern.compile(
      "charset\\s*=\\s*[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);

  /**
   * Matches a title element. The body is matched reluctantly so that the
   * match stops at the first closing tag, and DOTALL lets it span lines.
   */
  private static final Pattern TITLE_PATTERN = Pattern.compile(
      "<title(?:\\s[^>]*)?>(.*?)</title\\s*>",
      Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

  /**
   * Matches the href attribute of an a or area tag. Whitespace is required
   * in front of the attribute name so that names merely ending in "href"
   * (for example data-href) are not picked up.
   */
  private static final Pattern ANCHOR_PATTERN = Pattern.compile(
      "<(?:a|area)\\s(?:[^>]*?\\s)?href" + ATTRIBUTE_VALUE,
      Pattern.CASE_INSENSITIVE);

  /** Matches the src attribute of a frame or iframe tag. */
  private static final Pattern FRAME_PATTERN = Pattern.compile(
      "<i?frame\\s(?:[^>]*?\\s)?src" + ATTRIBUTE_VALUE,
      Pattern.CASE_INSENSITIVE);

  /**
   * Try to connect to "url" and read the contents.
   * The body is returned as received, including its line terminators, and
   * is decoded with the charset named in the Content-Type header when that
   * charset is supported (platform default charset otherwise).
   * @param url URL string.
   * @return An array of two Strings. The first item is the mime-type as
   *         reported by the connection, and the second item is the html
   *         contents. If the mime-type does not contain "text/html", the
   *         second item is the empty string.
   *         If the connection fails or reports no content type, the
   *         return value is null.
   */
  static String[] connectAndRead(String url) {
    String mimeType = null;
    StringBuffer pageString = new StringBuffer();
    URLConnection connection = null;
    Reader reader = null;
    try {
      connection = new URL(url).openConnection();

      mimeType = connection.getContentType();
      if (mimeType == null) {
        return null;
      }
      // Only html pages are downloaded; mime types are case-insensitive.
      if (mimeType.toLowerCase(Locale.ENGLISH).indexOf("text/html") >= 0) {
        reader = openReader(connection.getInputStream(), mimeType);
        // Copy characters verbatim: readLine() would drop the line breaks
        // and glue together tokens that were on adjacent lines.
        char[] buffer = new char[4096];
        int count;
        while ((count = reader.read(buffer)) != -1) {
          pageString.append(buffer, 0, count);
        }
      }
    } catch (Exception e) {
      // Documented contract: any failure while connecting or reading is
      // reported to the caller as null.
      return null;
    } finally {
      if (reader != null) {
        try {
          reader.close();
        } catch (IOException ignored) {
          // Nothing useful can be done if closing fails.
        }
      } else if (connection instanceof HttpURLConnection) {
        // The body was not read; release the connection explicitly.
        ((HttpURLConnection) connection).disconnect();
      }
    }

    // pack the result and return.
    return new String[] { mimeType, pageString.toString() };
  }

  /**
   * Wrap the stream in a buffered reader, using the charset named in the
   * content type if there is one and it is supported.
   * @param in the stream to decode.
   * @param contentType the Content-Type header value.
   * @return a buffered reader over "in".
   */
  private static Reader openReader(InputStream in, String contentType) {
    Matcher matcher = CHARSET_PATTERN.matcher(contentType);
    if (matcher.find()) {
      try {
        return new BufferedReader(new InputStreamReader(in, matcher.group(1)));
      } catch (UnsupportedEncodingException ignored) {
        // Unknown or malformed charset name: use the default below.
      }
    }
    return new BufferedReader(new InputStreamReader(in));
  }

  /**
   * Convert a URL to an absolute, normalized URL based on the current URL.
   * Leading and trailing whitespace of both arguments is ignored.
   * @param url A target URL, absolute or relative.
   * @param baseURL A base URL whose domain or directory is used.
   * @return Absolute URL of url, or the empty string if either argument is
   *         null or cannot be parsed as a URI.
   */
  static String convertToAbsolute(String url, String baseURL) {
    if (url == null || baseURL == null) {
      return "";
    }
    try {
      URI base = new URI(baseURL.trim());
      URI reference = new URI(url.trim());
      if (needsRootPath(base, reference)) {
        // "http://host" + "page.html" must become "http://host/page.html";
        // resolving against an empty base path would concatenate the
        // reference directly after the host name.
        base = base.resolve("/");
      }
      return base.resolve(reference).normalize().toString();
    } catch (URISyntaxException ignored) {
      // Unparsable input is reported as the empty string.
      return "";
    } catch (IllegalArgumentException ignored) {
      return "";
    }
  }

  /**
   * Tell whether "base" has an authority but an empty path while
   * "reference" is a relative path that does not start with a slash.
   */
  private static boolean needsRootPath(URI base, URI reference) {
    String basePath = base.getRawPath();
    String referencePath = reference.getRawPath();
    return base.getRawAuthority() != null
        && basePath != null
        && basePath.length() == 0
        && !reference.isAbsolute()
        && reference.getRawAuthority() == null
        && referencePath != null
        && referencePath.length() > 0
        && referencePath.charAt(0) != '/';
  }

  /**
   * Analyze the html string and get title and link addresses.
   * Links are taken from a/area tags first, then from frame/iframe tags,
   * and each one is converted with convertToAbsolute; a link that cannot
   * be converted is kept as the empty string.
   * @param pageString the input html to analyze; null is treated as an
   *                   empty page.
   * @param baseURL the base URL which is used to obtain absolute
   *                URL from relative path.
   * @return the result of analysis.
   */
  static HTMLPageInfo analyzeHTML(String pageString, String baseURL) {
    String page = (pageString == null) ? "" : pageString;
    HTMLPageInfo info = new HTMLPageInfo();

    info.title = getTitle(page);
    info.pageLists = getAnchors(page);
    info.pageLists.addAll(getFrames(page));

    for (int i = 0; i < info.pageLists.size(); i++) {
      String absoluteURL = convertToAbsolute((String) info.pageLists.get(i), baseURL);
      info.pageLists.set(i, absoluteURL);
    }
    return info;
  }

  /**
   * Find the title from pageString.
   * @param pageString A string from which the title is got.
   * @return The text of the first title element with runs of whitespace
   *         collapsed to single spaces. If there is no title,
   *         "Untitled Document".
   */
  private static String getTitle(String pageString) {
    Matcher matcher = TITLE_PATTERN.matcher(pageString);
    if (!matcher.find()) {
      return DEFAULT_TITLE;
    }
    return matcher.group(1).replaceAll("\\s+", " ").trim();
  }

  /**
   * Find anchor (a and area) tags and collect their href values.
   * @param pageString A string from which the anchors are got.
   * @return A vector of link URLs (String), in document order.
   */
  private static Vector getAnchors(String pageString) {
    return collectUrls(ANCHOR_PATTERN, pageString);
  }

  /**
   * Find frame and iframe tags and collect their src values.
   * @param pageString A string from which the frame tags are got.
   * @return A vector of link URLs (String), in document order.
   */
  private static Vector getFrames(String pageString) {
    return collectUrls(FRAME_PATTERN, pageString);
  }

  /**
   * Collect the attribute value of every match of tagPattern.
   * @param tagPattern a pattern ending with ATTRIBUTE_VALUE.
   * @param pageString the html text to scan.
   * @return A vector of the matched values (String), in document order.
   */
  private static Vector collectUrls(Pattern tagPattern, String pageString) {
    Vector urls = new Vector();
    // A single matcher walks the whole page; find() resumes after the
    // previous match, so the text never has to be copied.
    Matcher matcher = tagPattern.matcher(pageString);
    while (matcher.find()) {
      String value = null;
      for (int group = 1; group <= ATTRIBUTE_VALUE_GROUPS && value == null; group++) {
        value = matcher.group(group);
      }
      // Attribute values are entity-encoded in html; "&amp;" is the escape
      // that routinely shows up inside query strings.
      urls.add(value.replaceAll("&amp;", "&"));
    }
    return urls;
  }
}