diff file_io.c @ 0:0ce47c784647

Initial commit
author Merten Sach <msach@mailbox.tu-berlin.de>
date Tue, 27 Sep 2011 15:08:02 +0200
parents
children
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/file_io.c	Tue Sep 27 15:08:02 2011 +0200
     1.3 @@ -0,0 +1,162 @@
     1.4 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
     1.5 +/*   File:         file_io.c                                                 */
     1.6 +/*   Description:  This program reads point data from a file                 */
     1.7 +/*                 and writes cluster output to files                        */
     1.8 +/*   Input file format:                                                      */
     1.9 +/*                 ascii  file: each line contains 1 data object             */
    1.10 +/*                 binary file: first 4-byte integer is the number of data   */
    1.11 +/*                 objects and 2nd integer is the no. of features (or        */
    1.12 +/*                 coordinates) of each object                               */
    1.13 +/*                                                                           */
    1.14 +/*   Author:  Wei-keng Liao                                                  */
    1.15 +/*            ECE Department Northwestern University                         */
    1.16 +/*            email: wkliao@ece.northwestern.edu                             */
    1.17 +/*   Copyright, 2005, Wei-keng Liao                                          */
    1.18 +/*                                                                           */
    1.19 +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */
    1.20 +
    1.21 +#include <stdio.h>
    1.22 +#include <stdlib.h>
    1.23 +#include <string.h>     /* strtok() */
    1.24 +#include <sys/types.h>  /* open() */
    1.25 +#include <sys/stat.h>
    1.26 +#include <fcntl.h>
    1.27 +#include <unistd.h>     /* read(), close() */
    1.28 +
    1.29 +#include "kmeans.h"
    1.30 +
    1.31 +#define MAX_CHAR_PER_LINE 128
    1.32 +
    1.33 +
    1.34 +/*
    1.35 +*	Function: file_read
    1.36 +*	-------------------
    1.37 +*	Function for loading input data into memory.
    1.38 +*/
    1.39 +double** file_read(int   isBinaryFile,  /* flag: 0 or 1 */
    1.40 +                  char *filename,      /* input file name */
    1.41 +                  int  *numObjs,       /* count of data objects (local) */
    1.42 +                  int  *numCoords)     /* count of coordinates */
    1.43 +{
    1.44 +    float **objects;
    1.45 +    int     i, j, len;
    1.46 +    ssize_t numBytesRead;
    1.47 +
    1.48 +    if (isBinaryFile) {  /* input file is in raw binary format -------------*/
    1.49 +        int infile;
    1.50 +		fprintf(stderr, "Trying to read from binary file: %s", filename);
    1.51 +        if ((infile = open(filename, O_RDONLY, "0600")) == -1) {
    1.52 +            fprintf(stderr, "Error: Input File Not Found\n");
    1.53 +			exit(EXIT_FAILURE);
    1.54 +        }
    1.55 +        numBytesRead = read(infile, numObjs, sizeof(int));
    1.56 +        assert(numBytesRead == sizeof(int));
    1.57 +        numBytesRead = read(infile, numCoords, sizeof(int));
    1.58 +        assert(numBytesRead == sizeof(int));
    1.59 +
    1.60 +        /* allocate space for objects[][] and read all objects */
    1.61 +        len = (*numObjs) * (*numCoords);
    1.62 +        objects    = (float**)malloc((*numObjs) * sizeof(float*));
    1.63 +        objects[0] = (float*) malloc(len * sizeof(float));
    1.64 +
    1.65 +		if(objects == NULL || objects[0] == NULL) {
    1.66 +			fprintf(stderr, "Could Not Allocate Memory\n");
    1.67 +			exit(EXIT_FAILURE);
    1.68 +		}
    1.69 +
    1.70 +        for (i = 1; i < (*numObjs); i++)
    1.71 +            objects[i] = objects[i-1] + (*numCoords);
    1.72 +
    1.73 +        numBytesRead = read(infile, objects[0], len*sizeof(float));
    1.74 +        assert(numBytesRead == len*sizeof(float));
    1.75 +		fprintf(stderr, " ... Input read successfully!\n");
    1.76 +        close(infile);
    1.77 +
    1.78 +    } else {  /* input file is in ASCII format -------------------------------*/
    1.79 +        FILE *infile;
    1.80 +        char *line, *ret;
    1.81 +        int   lineLen;
    1.82 +
    1.83 +		fprintf(stderr, "Trying to read from ASCII file: %s", filename);
    1.84 +		if ((infile = fopen(filename, "r")) == NULL) {
    1.85 +            fprintf(stderr, "Error: Input File Not Found\n");
    1.86 +			exit(EXIT_FAILURE);
    1.87 +        }
    1.88 +
    1.89 +        /* first find the number of objects */
    1.90 +        lineLen = MAX_CHAR_PER_LINE;
    1.91 +        line = (char*) malloc(lineLen);
    1.92 +        assert(line != NULL);
    1.93 +
    1.94 +        (*numObjs) = 0;
    1.95 +        while (fgets(line, lineLen, infile) != NULL) {
    1.96 +            /* check each line to find the max line length */
    1.97 +            while (strlen(line) == lineLen-1) {
    1.98 +                /* this line read is not complete */
    1.99 +                len = strlen(line);
   1.100 +                fseek(infile, -len, SEEK_CUR);
   1.101 +
   1.102 +                /* increase lineLen */
   1.103 +                lineLen += MAX_CHAR_PER_LINE;
   1.104 +                line = (char*) realloc(line, lineLen);
   1.105 +                assert(line != NULL);
   1.106 +
   1.107 +                ret = fgets(line, lineLen, infile);
   1.108 +                assert(ret != NULL);
   1.109 +            }
   1.110 +
   1.111 +            if (strtok(line, " \t\n") != 0)
   1.112 +                (*numObjs)++;
   1.113 +        }
   1.114 +        rewind(infile);
   1.115 +
   1.116 +        /* find the no. objects of each object */
   1.117 +        (*numCoords) = 0;
   1.118 +        while (fgets(line, lineLen, infile) != NULL) {
   1.119 +            if (strtok(line, " \t\n") != 0) {
   1.120 +                /* ignore the id (first coordiinate): numCoords = 1; */
   1.121 +                while (strtok(NULL, " ,\t\n") != NULL) (*numCoords)++;
   1.122 +                break; /* this makes read from 1st object */
   1.123 +            }
   1.124 +        }
   1.125 +        rewind(infile);
   1.126 +
   1.127 +        /* allocate space for objects[][] and read all objects */
   1.128 +        len = (*numObjs) * (*numCoords);
   1.129 +        objects    = (float**)malloc((*numObjs) * sizeof(float*));
   1.130 +        assert(objects != NULL);
   1.131 +        objects[0] = (float*) malloc(len * sizeof(float));
   1.132 +        assert(objects[0] != NULL);
   1.133 +        for (i=1; i<(*numObjs); i++)
   1.134 +            objects[i] = objects[i-1] + (*numCoords);
   1.135 +
   1.136 +        i = 0;
   1.137 +        /* read all objects */
   1.138 +        while (fgets(line, lineLen, infile) != NULL) {
   1.139 +            if (strtok(line, " \t\n") == NULL) continue;
   1.140 +            for (j=0; j<(*numCoords); j++)
   1.141 +                objects[i][j] = atof(strtok(NULL, " ,\t\n"));
   1.142 +            i++;
   1.143 +        }
   1.144 +		fprintf(stderr, " ... Input read successfully!\n");
   1.145 +        fclose(infile);
   1.146 +        free(line);
   1.147 +    }
   1.148 +
   1.149 +
   1.150 +    double** objects_d = (double**)malloc((*numObjs) * sizeof(double*));
   1.151 +    objects_d[0] = (double*) malloc(len * sizeof(double));
   1.152 +    for (i = 1; i < (*numObjs); i++)
   1.153 +        objects_d[i] = objects_d[i-1] + (*numCoords);
   1.154 +    
   1.155 +    for (i=0; i< (*numObjs); i++){
   1.156 +        for (j=0; j<(*numCoords); j++){
   1.157 +            objects_d[i][j] = objects[i][j];
   1.158 +        }
   1.159 +    }
   1.160 +    free(objects[0]);
   1.161 +    free(objects);
   1.162 +
   1.163 +    return objects_d;
   1.164 +}
   1.165 +