# -*- coding: utf-8 -*-

"""
Overview:
    This module ( scrape_views.py, see version in VERSION ) counts the views of multiple instructables and logs them to "count" files
    ( one "count" file for each instructable ).
    Which instructables are counted is controlled via a control file ( name in URL_FILE, often "urllist.txt" )
        of comma separated values for:
        your title, the url of the instructable, the name of the text "count" file for recording the views
    Each "count" log file contains lines of:
        views, timestamp of counting
    See the comments in urllist.txt for more information.
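
    A hypothetical control file line ( values are made-up examples, not from a real file;
    fields may not themselves contain commas ):
        My Project, http://www.instructables.com/id/Example-Instructable/, my_project_count
    which would append counts to my_project_count.txt ( extension from COUNT_EXT ).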

    Author:  russ_hensel, see:  http://www.opencircuits.com/User:Russ_hensel  http://www.instructables.com/member/russ_hensel/
    Download: Instructables: search for Graph Instructable Views with Python Screen Scraping

Environment:
    OS:        Win 7  ( should work on other OSs with Python)
    IDE:       Spyder 2.3.1
    Language:  Python 2.7

Reminders, notes:
    logs to the console, watch the output
    contains a fair number of commented-out debugging statements
    will throw Python console errors on bad data

status/history:

    working, but see enhancement list

    Enhancements ( ! in process * done )
        !improve doc
        *comment lines allowed in url file
        *program log file (name in LOG_FILE)
        !see graph_views.py and coordinate with it
        *announce begin end of program and some constant values used in program
        !error checking still weak
        !look into date data conversion, formatting, etc get things that look more like dates
        *use a configurable subdirectory for saving data
        use a database if data sets get really large
        do not log if value has not gone up
        should make sure the data dir exists -  the check for a missing urllist.txt pretty much takes care of this
        *switch page fetch to urllib2 ( old urllib out )


Scraping ideas from: http://www.reddit.com/comments/6x4fv/learning_python_by_writing_a_screen_scraper/c053vdu

"""

import sys
import urllib2    # https://docs.python.org/2/howto/urllib2.html
import time
import os

# these are pretty much globals -- there are a few variables declared global
# trying to eliminate hardcoded magic numbers

PROG_NAME           = "scrape_views.py"
VERSION             = "2015 Feb 12.1"   # version of the program

COUNT_EXT           = ".txt"         # used as file extension for count files
LOG_FILE            = "views.log"    # name of the logging file
URL_FILE            = "urllist.txt"  # name of file where urls are kept
SLEEP_BETWEEN_SEC   = 1              # delay time between grabs so as not to stress server

# strings used to locate and parse out the number of views
SEARCH_A    = "<meta itemprop=\"datePublished\""
SEARCH_B    = "content=\"views:"
SEARCH_C    = "\" />"
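
# the page markup these constants target is assumed to look roughly like this
# ( an illustrative reconstruction from the search strings, not verified markup ):
#     <meta itemprop="datePublished" ... content="views:12345" />
# parseit() below returns the digits between SEARCH_B and SEARCH_C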


# globals ( and there may be a few more )
data_dir    = "" # if "" work in the same directory as this program; may only work on Windows?



def get_args():
    """
    get the arguments off the command line
    note that the only one recognized is data_dir -- no spaces allowed around the = sign
    note the log file is not yet open, so report with print
    """

    global data_dir

    for iarg in sys.argv[1:]:
        #print iarg
        argsplits   = iarg.split("=")
        if len( argsplits ) != 2:
            print "Error: bad arg, expected name=value >> " + iarg    # log file not open
            continue
        parm_name   = argsplits[0]
        parm_value  = argsplits[1]
        #print argsplits

        if parm_name == "data_dir":
            data_dir = parm_value
            print "command line arg >> " + iarg    # log file not open
            #print data_dir
            # should make sure dir exists -  check for url list ?

        else:
            print "Error: unrecognized arg " + parm_name + "  " + parm_value

    return


def write_count( filename, count, timestamp ):
    """
    write the count to filename,
    comma separated values: the count and the timestamp
    return nothing
    """

    msg     = str( count ) + "," + timestamp + "\n"
    outfile = open(  filename, 'a')
    outfile.write(msg)
    outfile.close()
    return
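
# a resulting count file line looks like ( values illustrative ):
#     12345,1423766400.0
# the count as an int string, the timestamp as str( time.time() )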


def parseit( apage ):
    """
    parse apage and find the number of views, return as a string
    return "" if parse fails
    """

    location  = apage.find( SEARCH_A )
    if location >= 0:        # find() returns -1 on failure; 0 is a valid hit
        data    = apage[location:location + 200]
    else:
        return ""

    location  = data.find( SEARCH_B )
    if location >= 0:
        location = location + len( SEARCH_B )    # could precompute this len
        data  = data[location:location + 200]
        #print data
    else:
        return ""

    location  = data.find( SEARCH_C )
    if location >= 0:
        data    = data[ 0:location ]
        #print data
    else:
        return ""
    # would be good to test for other errors here
    return data
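
# an illustrative call on a minimal made-up fragment:
#     parseit( '<meta itemprop="datePublished" content="views:123" />' )  # -> "123"
# a page missing any of the search strings returns ""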


def grab_page( aurl ):
    """
    return the contents of the page at aurl
    return "" if the fetch fails
    """

    try:
        web_sock = urllib2.urlopen( aurl )
    except ( urllib2.HTTPError, urllib2.URLError ):
        # https://gist.github.com/fedir/5883651
        # https://docs.python.org/2/howto/urllib2.html
        # covers 404s and other fetch / connection errors
        logit( "Site Down?: " + aurl )
        return ""

    else:
        src = web_sock.read()
        return src


def prog_info():
    """
    log info about the program and its arguments/environment
    nice to have system time and date
    """

    logit( "" )
    logit( "============================" )
    logit( "" )

    logit( "Scraping with " + PROG_NAME + " version = " + VERSION )
    logit( "" )

    # e.g.  data_dir=russ_data  graph_type=graph_tot
    if len( sys.argv ) <= 1:        # argv[0] is always the script name
        logit( "no command line args " )
    else:
        for ix_arg, a_arg in enumerate( sys.argv ):
            logit( "command line arg " + str( ix_arg ) + " = " + a_arg )

    logit( "current directory " + os.getcwd() )
    return


def logit( amsg ):
    """
    log amsg ( to console and logfile if any )
    flush so messages do not get lost in error conditions
    """

    global logfile

    logfile.write( amsg + "\n" )
    print amsg
    sys.stdout.flush()
    return


def main():
    """
    main routine
    """

    global logfile
    global url_file_line_ix

    get_args()

    logfile     = open( os.path.join(  data_dir, LOG_FILE   ), "a" ) #opens file for logging  
    logit( "" )
    
    prog_info()

    try:
        sites = open( os.path.join(  data_dir, URL_FILE   ) ).readlines()  
    except IOError:
        logit( "Error: Couldn't find url site file! = " +  os.path.join(  data_dir, URL_FILE   ) )
        logfile.close()
        sys.exit(1) # too drastic?

    url_file_line_ix    = 0

    for isite in sites:

        url_file_line_ix  += 1
        isite  = isite.strip()

        if isite == "":
            isite = "#"    # treat blank lines as comments
        if ( isite[0:1] != "#" ) and ( isite[0:6] != "graph_" ):   # skip comment lines and graph directives

            splits  = isite.split(",")
            if len( splits ) < 3:
                logit( "Error: bad url file line " + str( url_file_line_ix ) + " = " + isite )
                continue
            ititle  = splits[0].strip()
            iurl    = splits[1].strip()
            ilfn    = splits[2].strip()      # ilfn = log file name
            ipage   = grab_page( iurl )
            if ipage == "":
                logit( "grab failed at line " + str( url_file_line_ix ) )
            else:
                viewcount  = parseit( ipage )
                if viewcount == "":
                    logit( "parse failed at line " + str( url_file_line_ix ) )
                else:
                    logit( ititle + "  " + viewcount )

                    ts     = time.time()
                    ilfn   = ilfn + COUNT_EXT
                    ilfn   = os.path.join( data_dir, ilfn )
                    write_count( ilfn, int( viewcount ), str( ts ) )
            time.sleep( SLEEP_BETWEEN_SEC )
        else:
            logit( isite ) # not a url line, comment...
    logit( PROG_NAME + " all done -------  \n"  )
    logit( "" )
    logfile.close()


if __name__ == '__main__':
    
    # the program is intended to run with command line arguments supplied
    # from the development ide, the command line, or a batch file
    # if these do not work for you, you can "inject" command arguments using a line
    # like this next one:

    # sys.argv = [ "normally filename", "data_dir=test_data"  ]

    # to process files in the current directory run with no arguments or data_dir=
    #sys.argv = [ "normally filename", "data_dir="  ]

    main()




