data_test_develop/parseXmlSaveCsv.py at master · beirving/data_test_develop · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import csv
import urllib2
import datetime
import collections
import xml.etree.ElementTree as eTree


def download_file(url, file_name='downloaded_xml_data.xml'):
    """
    Download a file from an external source and save locally
    :param url: str
        Valid url for XML
    :param file_name: str
        Set location for file to be downloaded
        Default CWD/downloaded_xml_data.xml
    :return:  str : file location
    """
    source = urllib2.urlopen(url)
    contents = source.read()
    file_handler = open(file_name, 'w')
    file_handler.write(contents)
    file_handler.close()
    return os.path.realpath(file_name)


def check_valid_year(check_date, target_year=2016):
    """
    check if the date given is valid for the target year
    :param check_date: str
    :param target_year: str
    :return: bool
    """
    start = datetime.datetime(target_year, 1, 1)
    end = datetime.datetime(target_year, 12, 31, 23, 59, 59)
    date = datetime.datetime.strptime(check_date, '%Y-%m-%d %H:%M:%S')
    if date > end:
        return False
    if date < start:
        return False
    return True


def check_description(description, check_term=' and '):
    """
    Check if the first argument contains the 2nd argument
    :param description: str
    :param check_term: str
    :return: bool
    """
    if description.find(check_term) >= 0:
        return True
    return False


def listing_valid(listing_iterator_item):
    """
    Helper function to clean up parse_file function
    Runs check_valid_year and check_description under default params
    :param listing_iterator_item: xml.etree.ElementTree.Element
    :return: bool
    """
    listing_details = listing_iterator_item.find('ListingDetails')
    date_result = check_valid_year(listing_details.find('DateListed').text)
    if date_result is False:
        return date_result
    basic_details = listing_iterator_item.find('BasicDetails')
    return check_description(basic_details.find('Description').text)


def join_sub_nodes(main_node, term):
    """
    Outputs a comma separated string of the "term" node values
    :param main_node: xml.etree.ElementTree.Element
    :param term: str
    :return: str
    """
    if main_node is None:
        return ''
    else:
        node_items = main_node.findall(term)
        if node_items is None:
            return ''
        else:
            output_list = []
            for node_item in node_items:
                output_list.append(node_item.text)
        return ','.join(output_list)


def get_get_fields(listing_iterator_item):
    """
    Collect desired node values from xml
    :param listing_iterator_item: xml.etree.ElementTree.Element
    :return: collections.OrderedDict
    """
    output = collections.OrderedDict()
    listing_details = listing_iterator_item.find('ListingDetails')
    location_details = listing_iterator_item.find('Location')
    basic_details = listing_iterator_item.find('BasicDetails')
    rich_details = listing_iterator_item.find('RichDetails')

    output['MlsId'] = listing_details.find('MlsId').text
    output['MlsName'] = listing_details.find('MlsName').text

    output['DateListed'] = listing_details.find('DateListed').text

    output['StreetAddress'] = location_details.find('StreetAddress').text

    output['Price'] = listing_details.find('Price').text
    output['Bedrooms'] = basic_details.find('Bedrooms').text

    # this is producing no values
    # at this point I would talk with stake holders for clarifications on which nodes are important
    # lets talk about the process for getting more information on issues like this
    output['Bathrooms'] = basic_details.find('Bathrooms').text

    # this would be code to switch to use FullBathrooms, HalfBathrooms, ThreeQuarterBathrooms nodes
    # bathrooms = 0
    # bathrooms += int(0 if basic_details.find('FullBathrooms').text is None else basic_details.find('FullBathrooms').text)
    # bathrooms += int(0 if basic_details.find('HalfBathrooms').text is None else basic_details.find('HalfBathrooms').text)
    # bathrooms += int(0 if basic_details.find('ThreeQuarterBathrooms').text is None else basic_details.find('ThreeQuarterBathrooms').text)
    # output['Bathrooms'] = bathrooms
    #   Humorous result:
    #       the listing for 1110 Felbar Avenue has 102 bathrooms
    #       due to the xml having <HalfBathrooms>99</HalfBathrooms>

    output['Appliances'] = join_sub_nodes(rich_details.find('Appliances'), 'Appliance')
    output['Rooms '] = join_sub_nodes(rich_details.find('Rooms'), 'Room')

    # truncate to the 200th character
    description = basic_details.find('Description').text
    output['Description'] = description[0:200]
    return output


def write_listing_to_csv(listing_order_dict, file_name='output'):
    """
    Write the given dict data to the given csv
    :param listing_order_dict: collections.OrderedDict
    :param file_name: str
    :return: void
    """
    file_location = file_name+".csv"
    write_header = True
    if os.path.isfile(file_location) is True:
        write_header = False

    with open(file_location, 'ab') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(listing_order_dict.keys())
        writer.writerow(listing_order_dict.values())


def parse_and_save(local_file_location):
    """
    Iterate through given local xml file line by line and write to default csv location
    :param local_file_location: str
    :return: void
    """
    context = eTree.iterparse(local_file_location)
    for event, element in context:
        if event == "end" and element.tag == 'Listings':
            for listing in element:
                if listing_valid(listing):
                    listing_fields = get_get_fields(listing)
                    write_listing_to_csv(listing_fields)