patternpythonMinor
Analyze very large sets of engineering data from Excel files
Viewed 0 times
fromanalyzeexcellargefilesengineeringverysetsdata
Problem
I am an electrical power engineer with some programming skills. My boss asked me to make a program which could analyze very large data, make some calculations and give the result.
The task looks like this:
-
The task was to compose groups. Each group must be unique, and there should be groups that contain starting from 1 element until the maximum possible number of elements. The power of every element must be summed.
I solved it like this:
Program generates Excel files starting from list of 1 element group, till N element group. In each file, there is a list of each group’s elements names and sum of all objects powers.
That was general description of my task and how I handle it. My program works, but it needs days, if not weeks to give the result. I can't use RAM because of very large information, so I do every operation on hard disk. I don't know how to speed up my program. Do you have any idea how to solve my problem, or another approach I can use?
```
coreInput = {}
debug = False
import itertools, os, time, sys
from math import factorial
title = raw_input("Enter input file title: ")
fileName = os.path.dirname(os.path.realpath(__file__)) + "\\" + title
numGens = input("Maximum number of outaged generations (enter 0 for calculation all of them): ")
print ""
totalLines = 0
coreFile = open(fileName)
coreContent = coreFile.readlines()
coreFile.close()
for i in range(0, len(coreContent)):
coreContent[i] = coreContent[i].replace("\n", "")
coreInput[i] = coreContent[i].split(",")
'''
0 1 2 3 4
-----------------------------
N, NAME, P, A, U
The task looks like this:
- I have an Excel file with a list of objects (Number of object, Name, Power, Switched on state decimal fraction, Switched off state decimal fraction).
-
The task was to compose groups. Each group must be unique, and there should be groups that contain starting from 1 element until the maximum possible number of elements. The power of every element must be summed.
I solved it like this:
Program generates Excel files starting from list of 1 element group, till N element group. In each file, there is a list of each group’s elements names and sum of all objects powers.
- The total power of each group in each file must be compared to one another and must be done some calculations. The result will be some percentage for each power. I save this file separately as an final result file.
That was general description of my task and how I handle it. My program works, but it needs days, if not weeks to give the result. I can't use RAM because of very large information, so I do every operation on hard disk. I don't know how to speed up my program. Do you have any idea how to solve my problem, or another approach I can use?
```
coreInput = {}
debug = False
import itertools, os, time, sys
from math import factorial
title = raw_input("Enter input file title: ")
fileName = os.path.dirname(os.path.realpath(__file__)) + "\\" + title
numGens = input("Maximum number of outaged generations (enter 0 for calculation all of them): ")
print ""
totalLines = 0
coreFile = open(fileName)
coreContent = coreFile.readlines()
coreFile.close()
for i in range(0, len(coreContent)):
coreContent[i] = coreContent[i].replace("\n", "")
coreInput[i] = coreContent[i].split(",")
'''
0 1 2 3 4
-----------------------------
N, NAME, P, A, U
Solution
Some suggestions:
So here is how I would write it (ignoring the functions part). I am putting intermediate values inside one
```
from __future__ import division, print_function
import os
from functools import partial
from itertools import chain, combinations
from math import factorial
import pandas as pd
debug = True
# Get paths
file_location = os.path.dirname(os.path.realpath(__file__))
save_folder = "output"
outpath = os.path.join(file_location, save_folder)
if not os.path.exists(outpath):
os.makedirs(outpath)
# Get the path of the file to store the results
# All the results will be stored in this file
outfile = os.path.join(outpath, 'data.h5')
if os.path.exists(outfile):
os.remove(outfile)
#title = raw_input("Enter input file title: ")
title = 'test.csv'
filename = os.path.join(file_location, title)
#n_gens = input("Maximum number of outaged generations (enter 0 for calculation all of them): ")
n_gens = 0
print("")
# Read the input file and save it to our output file
core_input = pd.read_csv(filename, sep='\s+', header=0, index_col=0)
core_input.to_hdf(outfile, 'core_input')
if debug:
print(core_input)
# We don't need the name column anymore
del core_input['NAME']
# Pandas makes this easy
total_u = core_input.U.prod()
n_rows = len(core_input)
if debug:
print("total_u =", total_u)
if n_gens == 0:
n_gens = n_rows
def ncr(n, r):
return factorial(n)/(factorial(r)*factorial(n-r))
total_lines = 0
cnames = ['Total_Power', 'Statistics']
with pd.HDFStore(outfile) as store:
for el_num in range (1, n_gens + 1):
print(el_num, "Gen statistics in progress...")
progress_total = ncr(n_rows, el_num)
# We get all the combinations of row numbers, then use a generator to
# get those rows out of core_input as-needed
ind_combs = combinations(core_input.index, el_num)
dfs = (core_input.loc[inds, :] for inds in ind_combs)
# Get the columns, and put empty data in so we can append later
keyname = 'generators/el_num_{}'.format(el_num)
for i, idf in enumerate(dfs):
ptot = idf.P.sum()
res = idf.A.prod()*total_u/idf.U.prod()
idf2 = pd.DataFrame([[ptot, res]], columns=cnames, index=[i])
store.append(keyname, idf2, data_columns=['Total_Power'])
progressStatus = 100*i/progress_total
print('%3d%%' % progressStatus)
store.create_table_index(keyname)
total_lines += i+1
print(" ", el_num, "Gen statistics Finished")
print("\nAnalyzing results and generating final output file \n")
with pd.HDFStore(outfile) as store:
# Get the columns, and put empty data in so we can append later
keyname = 'results_final'
cnames = ['Power', 'Statistics']
# We use this to keep track of the P values that were already done
doneps = set()
# Get the names of the data stored in the HDF5 file
keypaths = (key for key in store.keys() if '/generators/' in key)
# Get the rows from the previous results
p_select = partial(store.select, columns=['Total_Power'],
iterator=True, chunksize=1)
rows = (p_select(key) for key in keypaths)
rows = chain.from_iterable(rows)
for i, row in enumerate(rows):
ptot = row.iloc[0, 0]
if ptot in doneps:
continue
doneps.add(ptot)
# We use this to filter out values that
- Follow the pep8 style guide
- Use
from __future__ import division, print_function.divisionin particular is important for numerical analysis like this.
- You should be using the pandas python package for this. It is designed for this sort of thing.
- Don't save intermediate or final results in
csvfiles if you can avoid it, they are very slow. At least you should write to acsvfile after everything is done.
- It is better to break your code into functions
- For checking what values you have already done, it is much faster to us a
setwithin.
- Use
os.path.joinfor joining paths.
- You can use something like
'test %s' % 5, or better yet'test {}'.format(5), to put numbers in strings, which is much cleaner than something like'test ' + str(5).
- Always use
withfor opening files.
- You can use
list(range(a, b, c))to convert a list directly to a range without needing to do a loop.
- You can use
a += 1to increment numbers, rather thana = a + 1.
printwrites tostdoutby default, so I don't think you need to specially callsys.stdout.write.
- If you are going to loop over an index you won't use, it is generally considered good from to use
_as a throwaway variable, likefor _ in range(x).
- You can use
continueto skip the rest of the current iteration of aforloop, rather than wrapping most of the loop in aniftest.
- For testing if a string starts with a letter you can just do
mystr[0] == 't'
So here is how I would write it (ignoring the functions part). I am putting intermediate values inside one
HDF5 file, and then saving the final result to a csv file at the very end:```
from __future__ import division, print_function
import os
from functools import partial
from itertools import chain, combinations
from math import factorial
import pandas as pd
debug = True
# Get paths
file_location = os.path.dirname(os.path.realpath(__file__))
save_folder = "output"
outpath = os.path.join(file_location, save_folder)
if not os.path.exists(outpath):
os.makedirs(outpath)
# Get the path of the file to store the results
# All the results will be stored in this file
outfile = os.path.join(outpath, 'data.h5')
if os.path.exists(outfile):
os.remove(outfile)
#title = raw_input("Enter input file title: ")
title = 'test.csv'
filename = os.path.join(file_location, title)
#n_gens = input("Maximum number of outaged generations (enter 0 for calculation all of them): ")
n_gens = 0
print("")
# Read the input file and save it to our output file
core_input = pd.read_csv(filename, sep='\s+', header=0, index_col=0)
core_input.to_hdf(outfile, 'core_input')
if debug:
print(core_input)
# We don't need the name column anymore
del core_input['NAME']
# Pandas makes this easy
total_u = core_input.U.prod()
n_rows = len(core_input)
if debug:
print("total_u =", total_u)
if n_gens == 0:
n_gens = n_rows
def ncr(n, r):
return factorial(n)/(factorial(r)*factorial(n-r))
total_lines = 0
cnames = ['Total_Power', 'Statistics']
with pd.HDFStore(outfile) as store:
for el_num in range (1, n_gens + 1):
print(el_num, "Gen statistics in progress...")
progress_total = ncr(n_rows, el_num)
# We get all the combinations of row numbers, then use a generator to
# get those rows out of core_input as-needed
ind_combs = combinations(core_input.index, el_num)
dfs = (core_input.loc[inds, :] for inds in ind_combs)
# Get the columns, and put empty data in so we can append later
keyname = 'generators/el_num_{}'.format(el_num)
for i, idf in enumerate(dfs):
ptot = idf.P.sum()
res = idf.A.prod()*total_u/idf.U.prod()
idf2 = pd.DataFrame([[ptot, res]], columns=cnames, index=[i])
store.append(keyname, idf2, data_columns=['Total_Power'])
progressStatus = 100*i/progress_total
print('%3d%%' % progressStatus)
store.create_table_index(keyname)
total_lines += i+1
print(" ", el_num, "Gen statistics Finished")
print("\nAnalyzing results and generating final output file \n")
with pd.HDFStore(outfile) as store:
# Get the columns, and put empty data in so we can append later
keyname = 'results_final'
cnames = ['Power', 'Statistics']
# We use this to keep track of the P values that were already done
doneps = set()
# Get the names of the data stored in the HDF5 file
keypaths = (key for key in store.keys() if '/generators/' in key)
# Get the rows from the previous results
p_select = partial(store.select, columns=['Total_Power'],
iterator=True, chunksize=1)
rows = (p_select(key) for key in keypaths)
rows = chain.from_iterable(rows)
for i, row in enumerate(rows):
ptot = row.iloc[0, 0]
if ptot in doneps:
continue
doneps.add(ptot)
# We use this to filter out values that
Code Snippets
from __future__ import division, print_function
import os
from functools import partial
from itertools import chain, combinations
from math import factorial
import pandas as pd
debug = True
# Get paths
file_location = os.path.dirname(os.path.realpath(__file__))
save_folder = "output"
outpath = os.path.join(file_location, save_folder)
if not os.path.exists(outpath):
os.makedirs(outpath)
# Get the path of the file to store the results
# All the results will be stored in this file
outfile = os.path.join(outpath, 'data.h5')
if os.path.exists(outfile):
os.remove(outfile)
#title = raw_input("Enter input file title: ")
title = 'test.csv'
filename = os.path.join(file_location, title)
#n_gens = input("Maximum number of outaged generations (enter 0 for calculation all of them): ")
n_gens = 0
print("")
# Read the input file and save it to our output file
core_input = pd.read_csv(filename, sep='\s+', header=0, index_col=0)
core_input.to_hdf(outfile, 'core_input')
if debug:
print(core_input)
# We don't need the name column anymore
del core_input['NAME']
# Pandas makes this easy
total_u = core_input.U.prod()
n_rows = len(core_input)
if debug:
print("total_u =", total_u)
if n_gens == 0:
n_gens = n_rows
def ncr(n, r):
return factorial(n)/(factorial(r)*factorial(n-r))
total_lines = 0
cnames = ['Total_Power', 'Statistics']
with pd.HDFStore(outfile) as store:
for el_num in range (1, n_gens + 1):
print(el_num, "Gen statistics in progress...")
progress_total = ncr(n_rows, el_num)
# We get all the combinations of row numbers, then use a generator to
# get those rows out of core_input as-needed
ind_combs = combinations(core_input.index, el_num)
dfs = (core_input.loc[inds, :] for inds in ind_combs)
# Get the columns, and put empty data in so we can append later
keyname = 'generators/el_num_{}'.format(el_num)
for i, idf in enumerate(dfs):
ptot = idf.P.sum()
res = idf.A.prod()*total_u/idf.U.prod()
idf2 = pd.DataFrame([[ptot, res]], columns=cnames, index=[i])
store.append(keyname, idf2, data_columns=['Total_Power'])
progressStatus = 100*i/progress_total
print('%3d%%' % progressStatus)
store.create_table_index(keyname)
total_lines += i+1
print(" ", el_num, "Gen statistics Finished")
print("\nAnalyzing results and generating final output file \n")
with pd.HDFStore(outfile) as store:
# Get the columns, and put empty data in so we can append later
keyname = 'results_final'
cnames = ['Power', 'Statistics']
# We use this to keep track of the P values that were already done
doneps = set()
# Get the names of the data stored in the HDF5 file
keypaths = (key for key in store.keys() if '/generators/' in key)
# Get the rows from the previous results
p_select = partial(store.select, columns=['Total_Power'],
Context
StackExchange Code Review Q#102469, answer score: 3
Revisions (0)
No revisions yet.